Skip to content

Commit 4e1c620

Browse files
authored
Merge pull request #32 from simdutf/cleaning_fixed
Some cleaning and optimization
2 parents 1abd80d + 50f00bb commit 4e1c620

File tree

12 files changed

+206
-184
lines changed

12 files changed

+206
-184
lines changed

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@ as a reference (`System.Buffers.Text.Base64.DecodeFromUtf8`).
2727

2828
| processor | SimdBase64 (GB/s) | .NET speed (GB/s) | speed up |
2929
|:----------------|:------------------------|:-------------------|:-------------------|
30-
| Apple M2 processor (ARM) | 6.2 | 3.8 | 1.6 x |
31-
| Intel Ice Lake (AVX2) | 5.3 | 3.4 | 1.6 x |
30+
| Apple M2 processor (ARM) | 6.3 | 3.8 | 1.7 x |
31+
| Intel Ice Lake (AVX2) | 5.8 | 3.4 | 1.7 x |
32+
| Intel Ice Lake (SSSE3) | 4.7 | 3.4 | 1.4 x |
3233

3334
Our results are more impressive when comparing against the standard base64 string decoding
3435
function (`Convert.FromBase64String(mystring)`), but we omit these results for simplicity.

benchmark/Benchmark.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,8 @@ public Config()
153153
}
154154
// Parameters and variables for real data
155155
[Params(
156-
@"data/email/" //,
157-
//@"data/dns/swedenzonebase.txt"
156+
@"data/email/" //,
157+
//@"data/dns/swedenzonebase.txt"
158158
)]
159159
#pragma warning disable CA1051
160160
public string? FileName;

src/Base64ARM.cs

Lines changed: 30 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ public static partial class Base64
1616
// If needed for debugging, you can do the following:
1717
/*static string VectorToString(Vector128<byte> vector)
1818
{
19-
Span<byte> bytes = new byte[16];
19+
Span<byte> bytes = stackalloc byte[16];
2020
vector.CopyTo(bytes);
2121
StringBuilder sb = new StringBuilder();
2222
foreach (byte b in bytes)
@@ -216,21 +216,20 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
216216
b->chunk3 += roll3;
217217
return badCharmask;
218218
}
219-
220219
[MethodImpl(MethodImplOptions.AggressiveInlining)]
221-
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output)
220+
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
222221
{
223222
ulong nmask = ~mask;
224-
Compress(b.chunk0, (ushort)mask, output);
225-
Compress(b.chunk1, (ushort)(mask >> 16), output + UInt64.PopCount(nmask & 0xFFFF));
226-
Compress(b.chunk2, (ushort)(mask >> 32), output + UInt64.PopCount(nmask & 0xFFFFFFFF));
227-
Compress(b.chunk3, (ushort)(mask >> 48), output + UInt64.PopCount(nmask & 0xFFFFFFFFFFFFUL));
223+
Compress(b.chunk0, (ushort)mask, output, tablePtr);
224+
Compress(b.chunk1, (ushort)(mask >> 16), output + UInt64.PopCount(nmask & 0xFFFF), tablePtr);
225+
Compress(b.chunk2, (ushort)(mask >> 32), output + UInt64.PopCount(nmask & 0xFFFFFFFF), tablePtr);
226+
Compress(b.chunk3, (ushort)(mask >> 48), output + UInt64.PopCount(nmask & 0xFFFFFFFFFFFFUL), tablePtr);
228227

229228
return UInt64.PopCount(nmask);
230229
}
231230

232231
[MethodImpl(MethodImplOptions.AggressiveInlining)]
233-
private static unsafe void Compress(Vector128<byte> data, ushort mask, byte* output)
232+
private static unsafe void Compress(Vector128<byte> data, ushort mask, byte* output, byte* tablePtr)
234233
{
235234
if (mask == 0)
236235
{
@@ -246,8 +245,8 @@ private static unsafe void Compress(Vector128<byte> data, ushort mask, byte* out
246245
// thintable_epi8[mask2] into a 128-bit register, using only
247246
// two instructions on most compilers.
248247

249-
ulong value1 = Tables.thintableEpi8[mask1];
250-
ulong value2 = Tables.thintableEpi8[mask2];
248+
ulong value1 = Tables.GetThintableEpi8(mask1);
249+
ulong value2 = Tables.GetThintableEpi8(mask2);
251250

252251
Vector128<sbyte> shufmask = Vector128.Create(value2, value1).AsSByte();
253252

@@ -259,19 +258,15 @@ private static unsafe void Compress(Vector128<byte> data, ushort mask, byte* out
259258
Vector128<sbyte> pruned = AdvSimd.Arm64.VectorTableLookup(data.AsSByte(), shufmask);
260259
// we still need to put the two halves together.
261260
// we compute the popcount of the first half:
262-
int pop1 = Tables.BitsSetTable256mul2[mask1];
261+
int pop1 = Tables.GetBitsSetTable256mul2(mask1);
263262
// then load the corresponding mask, what it does is to write
264263
// only the first pop1 bytes from the first 8 bytes, and then
265264
// it fills in with the bytes from the second 8 bytes + some filling
266-
// at the end.
265+
// at the end.
266+
Vector128<byte> compactmask = Vector128.Load(tablePtr + pop1 * 8);
267267

268-
fixed (byte* tablePtr = Tables.pshufbCombineTable)
269-
{
270-
Vector128<byte> compactmask = Vector128.Load(tablePtr + pop1 * 8);
271-
272-
Vector128<byte> answer = AdvSimd.Arm64.VectorTableLookup(pruned.AsByte(), compactmask);
273-
Vector128.Store(answer, output);
274-
}
268+
Vector128<byte> answer = AdvSimd.Arm64.VectorTableLookup(pruned.AsByte(), compactmask);
269+
Vector128.Store(answer, output);
275270
}
276271

277272
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -341,7 +336,6 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
341336
{
342337
// translation from ASCII to 6 bit values
343338
bool isUrl = false;
344-
byte[] toBase64 = Tables.ToBase64Value;
345339
bytesConsumed = 0;
346340
bytesWritten = 0;
347341
const int blocksSize = 6;
@@ -352,6 +346,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
352346
fixed (byte* srcInit = source)
353347
fixed (byte* dstInit = dest)
354348
fixed (byte* startOfBuffer = buffer)
349+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
355350
{
356351
byte* srcEnd = srcInit + source.Length;
357352
byte* src = srcInit;
@@ -429,7 +424,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
429424
// optimization opportunity: check for simple masks like those made of
430425
// continuous 1s followed by continuous 0s. And masks containing a
431426
// single bad character.
432-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
427+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
433428
bufferPtr += compressedBytesCount;
434429
bufferBytesConsumed += compressedBytesCount;
435430
}
@@ -469,7 +464,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
469464
int lastBlockSrcCount = 0;
470465
while ((bufferPtr - startOfBuffer) % 64 != 0 && src < srcEnd)
471466
{
472-
byte val = toBase64[(int)*src];
467+
byte val = SimdBase64.Tables.GetToBase64Value((uint)*src);
473468
*bufferPtr = val;
474469
if (val > 64)
475470
{
@@ -533,7 +528,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
533528

534529
while (leftover < 4 && src < srcEnd)
535530
{
536-
byte val = toBase64[(byte)*src];
531+
byte val = SimdBase64.Tables.GetToBase64Value((byte)*src);
537532
if (val > 64)
538533
{
539534
bytesConsumed = (int)(src - srcInit);
@@ -645,7 +640,6 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
645640
{
646641
// translation from ASCII to 6 bit values
647642
bool isUrl = false;
648-
byte[] toBase64 = Tables.ToBase64Value;
649643
bytesConsumed = 0;
650644
bytesWritten = 0;
651645
const int blocksSize = 6;
@@ -656,6 +650,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
656650
fixed (char* srcInit = source)
657651
fixed (byte* dstInit = dest)
658652
fixed (byte* startOfBuffer = buffer)
653+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
659654
{
660655
char* srcEnd = srcInit + source.Length;
661656
char* src = srcInit;
@@ -730,7 +725,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
730725
// optimization opportunity: check for simple masks like those made of
731726
// continuous 1s followed by continuous 0s. And masks containing a
732727
// single bad character.
733-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
728+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
734729
bufferPtr += compressedBytesCount;
735730
bufferBytesConsumed += compressedBytesCount;
736731
}
@@ -787,7 +782,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
787782
bytesWritten += remainderBytesWritten;
788783
return result;
789784
}
790-
byte val = toBase64[(int)*src];
785+
byte val = SimdBase64.Tables.GetToBase64Value((uint)*src);
791786
*bufferPtr = val;
792787
if (val > 64)
793788
{
@@ -858,7 +853,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
858853
return OperationStatus.InvalidData;
859854
}
860855

861-
byte val = toBase64[(byte)*src];
856+
byte val = SimdBase64.Tables.GetToBase64Value((uint)*src);
862857
if (val > 64)
863858
{
864859
bytesConsumed = (int)(src - srcInit);
@@ -971,7 +966,6 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<b
971966
{
972967
// translation from ASCII to 6 bit values
973968
bool isUrl = true;
974-
byte[] toBase64 = Tables.ToBase64UrlValue;
975969
bytesConsumed = 0;
976970
bytesWritten = 0;
977971
const int blocksSize = 6;
@@ -982,6 +976,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<b
982976
fixed (byte* srcInit = source)
983977
fixed (byte* dstInit = dest)
984978
fixed (byte* startOfBuffer = buffer)
979+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
985980
{
986981
byte* srcEnd = srcInit + source.Length;
987982
byte* src = srcInit;
@@ -1056,7 +1051,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<b
10561051
// optimization opportunity: check for simple masks like those made of
10571052
// continuous 1s followed by continuous 0s. And masks containing a
10581053
// single bad character.
1059-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
1054+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
10601055
bufferPtr += compressedBytesCount;
10611056
bufferBytesConsumed += compressedBytesCount;
10621057

@@ -1098,7 +1093,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<b
10981093
int lastBlockSrcCount = 0;
10991094
while ((bufferPtr - startOfBuffer) % 64 != 0 && src < srcEnd)
11001095
{
1101-
byte val = toBase64[(int)*src];
1096+
byte val = Tables.GetToBase64UrlValue((byte)*src);
11021097
*bufferPtr = val;
11031098
if (val > 64)
11041099
{
@@ -1163,7 +1158,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<b
11631158

11641159
while (leftover < 4 && src < srcEnd)
11651160
{
1166-
byte val = toBase64[(byte)*src];
1161+
byte val = Tables.GetToBase64UrlValue((byte)*src);
11671162
if (val > 64)
11681163
{
11691164
bytesConsumed = (int)(src - srcInit);
@@ -1277,7 +1272,6 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<c
12771272
{
12781273
// translation from ASCII to 6 bit values
12791274
bool isUrl = true;
1280-
byte[] toBase64 = Tables.ToBase64UrlValue;
12811275
bytesConsumed = 0;
12821276
bytesWritten = 0;
12831277
const int blocksSize = 6;
@@ -1288,6 +1282,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<c
12881282
fixed (char* srcInit = source)
12891283
fixed (byte* dstInit = dest)
12901284
fixed (byte* startOfBuffer = buffer)
1285+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
12911286
{
12921287
char* srcEnd = srcInit + source.Length;
12931288
char* src = srcInit;
@@ -1365,7 +1360,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<c
13651360
// optimization opportunity: check for simple masks like those made of
13661361
// continuous 1s followed by continuous 0s. And masks containing a
13671362
// single bad character.
1368-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
1363+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
13691364
bufferPtr += compressedBytesCount;
13701365
bufferBytesConsumed += compressedBytesCount;
13711366

@@ -1424,7 +1419,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<c
14241419
bytesWritten += remainderBytesWritten;
14251420
return result;
14261421
}
1427-
byte val = toBase64[(int)*src];
1422+
byte val = Tables.GetToBase64UrlValue((byte)*src);
14281423
*bufferPtr = val;
14291424
if (val > 64)
14301425
{
@@ -1495,7 +1490,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<c
14951490
bytesWritten = (int)(dst - dstInit);
14961491
return OperationStatus.InvalidData;
14971492
}
1498-
byte val = toBase64[(byte)*src];
1493+
byte val = Tables.GetToBase64UrlValue((byte)*src);
14991494
if (val > 64)
15001495
{
15011496
bytesConsumed = (int)(src - srcInit);

0 commit comments

Comments
 (0)