Skip to content

Commit b9034ef

Browse files
committed
format
1 parent d1716ab commit b9034ef

File tree

7 files changed

+55
-57
lines changed

7 files changed

+55
-57
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ as a reference (`System.Buffers.Text.Base64.DecodeFromUtf8`).
2727

2828
| processor | SimdBase64(GB/s) | .NET speed (GB/s) | speed up |
2929
|:----------------|:------------------------|:-------------------|:-------------------|
30-
| Apple M2 processor (ARM) | 6.2 | 3.8 | 1.6 x |
30+
| Apple M2 processor (ARM) | 6.3 | 3.8 | 1.6 x |
3131
| Intel Ice Lake (AVX2) | 5.3 | 3.4 | 1.6 x |
3232

3333
Our results are more impressive when comparing against the standard base64 string decoding

src/Base64ARM.cs

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -216,21 +216,20 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
216216
b->chunk3 += roll3;
217217
return badCharmask;
218218
}
219-
220219
[MethodImpl(MethodImplOptions.AggressiveInlining)]
221-
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output)
220+
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
222221
{
223222
ulong nmask = ~mask;
224-
Compress(b.chunk0, (ushort)mask, output);
225-
Compress(b.chunk1, (ushort)(mask >> 16), output + UInt64.PopCount(nmask & 0xFFFF));
226-
Compress(b.chunk2, (ushort)(mask >> 32), output + UInt64.PopCount(nmask & 0xFFFFFFFF));
227-
Compress(b.chunk3, (ushort)(mask >> 48), output + UInt64.PopCount(nmask & 0xFFFFFFFFFFFFUL));
223+
Compress(b.chunk0, (ushort)mask, output, tablePtr);
224+
Compress(b.chunk1, (ushort)(mask >> 16), output + UInt64.PopCount(nmask & 0xFFFF), tablePtr);
225+
Compress(b.chunk2, (ushort)(mask >> 32), output + UInt64.PopCount(nmask & 0xFFFFFFFF), tablePtr);
226+
Compress(b.chunk3, (ushort)(mask >> 48), output + UInt64.PopCount(nmask & 0xFFFFFFFFFFFFUL), tablePtr);
228227

229228
return UInt64.PopCount(nmask);
230229
}
231230

232231
[MethodImpl(MethodImplOptions.AggressiveInlining)]
233-
private static unsafe void Compress(Vector128<byte> data, ushort mask, byte* output)
232+
private static unsafe void Compress(Vector128<byte> data, ushort mask, byte* output, byte* tablePtr)
234233
{
235234
if (mask == 0)
236235
{
@@ -263,15 +262,11 @@ private static unsafe void Compress(Vector128<byte> data, ushort mask, byte* out
263262
// then load the corresponding mask, what it does is to write
264263
// only the first pop1 bytes from the first 8 bytes, and then
265264
// it fills in with the bytes from the second 8 bytes + some filling
266-
// at the end.
265+
// at the end.
266+
Vector128<byte> compactmask = Vector128.Load(tablePtr + pop1 * 8);
267267

268-
fixed (byte* tablePtr = Tables.pshufbCombineTable)
269-
{
270-
Vector128<byte> compactmask = Vector128.Load(tablePtr + pop1 * 8);
271-
272-
Vector128<byte> answer = AdvSimd.Arm64.VectorTableLookup(pruned.AsByte(), compactmask);
273-
Vector128.Store(answer, output);
274-
}
268+
Vector128<byte> answer = AdvSimd.Arm64.VectorTableLookup(pruned.AsByte(), compactmask);
269+
Vector128.Store(answer, output);
275270
}
276271

277272
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -351,6 +346,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
351346
fixed (byte* srcInit = source)
352347
fixed (byte* dstInit = dest)
353348
fixed (byte* startOfBuffer = buffer)
349+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
354350
{
355351
byte* srcEnd = srcInit + source.Length;
356352
byte* src = srcInit;
@@ -428,7 +424,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
428424
// optimization opportunity: check for simple masks like those made of
429425
// continuous 1s followed by continuous 0s. And masks containing a
430426
// single bad character.
431-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
427+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
432428
bufferPtr += compressedBytesCount;
433429
bufferBytesConsumed += compressedBytesCount;
434430
}
@@ -654,6 +650,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
654650
fixed (char* srcInit = source)
655651
fixed (byte* dstInit = dest)
656652
fixed (byte* startOfBuffer = buffer)
653+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
657654
{
658655
char* srcEnd = srcInit + source.Length;
659656
char* src = srcInit;
@@ -728,7 +725,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp
728725
// optimization opportunity: check for simple masks like those made of
729726
// continuous 1s followed by continuous 0s. And masks containing a
730727
// single bad character.
731-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
728+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
732729
bufferPtr += compressedBytesCount;
733730
bufferBytesConsumed += compressedBytesCount;
734731
}
@@ -979,6 +976,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<b
979976
fixed (byte* srcInit = source)
980977
fixed (byte* dstInit = dest)
981978
fixed (byte* startOfBuffer = buffer)
979+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
982980
{
983981
byte* srcEnd = srcInit + source.Length;
984982
byte* src = srcInit;
@@ -1053,7 +1051,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<b
10531051
// optimization opportunity: check for simple masks like those made of
10541052
// continuous 1s followed by continuous 0s. And masks containing a
10551053
// single bad character.
1056-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
1054+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
10571055
bufferPtr += compressedBytesCount;
10581056
bufferBytesConsumed += compressedBytesCount;
10591057

@@ -1284,6 +1282,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<c
12841282
fixed (char* srcInit = source)
12851283
fixed (byte* dstInit = dest)
12861284
fixed (byte* startOfBuffer = buffer)
1285+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
12871286
{
12881287
char* srcEnd = srcInit + source.Length;
12891288
char* src = srcInit;
@@ -1361,7 +1360,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan<c
13611360
// optimization opportunity: check for simple masks like those made of
13621361
// continuous 1s followed by continuous 0s. And masks containing a
13631362
// single bad character.
1364-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
1363+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
13651364
bufferPtr += compressedBytesCount;
13661365
bufferBytesConsumed += compressedBytesCount;
13671366

src/Base64AVX2UTF16.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64AVX2Regular(ReadOnlyS
3535
fixed (char* srcInit = source)
3636
fixed (byte* dstInit = dest)
3737
fixed (byte* startOfBuffer = buffer)
38+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
3839
{
3940
char* srcEnd = srcInit + source.Length;
4041
char* src = srcInit;
@@ -118,7 +119,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64AVX2Regular(ReadOnlyS
118119
// optimization opportunity: check for simple masks like those made of
119120
// continuous 1s followed by continuous 0s. And masks containing a
120121
// single bad character.
121-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
122+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
122123
bufferPtr += compressedBytesCount;
123124
bufferBytesConsumed += compressedBytesCount;
124125

@@ -399,6 +400,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64AVX2Url(ReadOnlySpan<
399400
fixed (char* srcInit = source)
400401
fixed (byte* dstInit = dest)
401402
fixed (byte* startOfBuffer = buffer)
403+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
402404
{
403405
char* srcEnd = srcInit + source.Length;
404406
char* src = srcInit;
@@ -481,7 +483,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64AVX2Url(ReadOnlySpan<
481483
// optimization opportunity: check for simple masks like those made of
482484
// continuous 1s followed by continuous 0s. And masks containing a
483485
// single bad character.
484-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
486+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
485487
bufferPtr += compressedBytesCount;
486488
bufferBytesConsumed += compressedBytesCount;
487489

src/Base64AVX2UTF8.cs

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -165,17 +165,17 @@ private static UInt64 ToBase64Mask(bool base64Url, ref Vector256<byte> src, ref
165165
}
166166

167167
[MethodImpl(MethodImplOptions.AggressiveInlining)]
168-
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output)
168+
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
169169
{
170170
ulong nmask = ~mask;
171-
Compress(b.chunk0, (UInt32)mask, output);
172-
Compress(b.chunk1, (UInt32)(mask >> 32), output + Popcnt.X64.PopCount(nmask & 0xFFFFFFFF));
171+
Compress(b.chunk0, (UInt32)mask, output, tablePtr);
172+
Compress(b.chunk1, (UInt32)(mask >> 32), output + Popcnt.X64.PopCount(nmask & 0xFFFFFFFF), tablePtr);
173173

174174
return Popcnt.X64.PopCount(nmask);
175175
}
176176

177177
[MethodImpl(MethodImplOptions.AggressiveInlining)] // This Compress is the same as in SSE
178-
private static unsafe void Compress(Vector128<byte> data, ushort mask, byte* output)
178+
private static unsafe void Compress(Vector128<byte> data, ushort mask, byte* output, byte* tablePtr)
179179
{
180180
if (mask == 0)
181181
{
@@ -208,17 +208,13 @@ private static unsafe void Compress(Vector128<byte> data, ushort mask, byte* out
208208
// only the first pop1 bytes from the first 8 bytes, and then
209209
// it fills in with the bytes from the second 8 bytes + some filling
210210
// at the end.
211+
Vector128<byte> compactmask = Sse2.LoadVector128(tablePtr + pop1 * 8);
211212

212-
fixed (byte* tablePtr = Tables.pshufbCombineTable)
213-
{
214-
Vector128<byte> compactmask = Sse2.LoadVector128(tablePtr + pop1 * 8);
215-
216-
Vector128<byte> answer = Ssse3.Shuffle(pruned.AsByte(), compactmask);
217-
Sse2.Store(output, answer);
218-
}
213+
Vector128<byte> answer = Ssse3.Shuffle(pruned.AsByte(), compactmask);
214+
Sse2.Store(output, answer);
219215
}
220216

221-
public static unsafe void Compress(Vector256<byte> data, uint mask, byte* output)
217+
public static unsafe void Compress(Vector256<byte> data, uint mask, byte* output, byte* tablePtr)
222218
{
223219
if (mask == 0)
224220
{
@@ -227,11 +223,11 @@ public static unsafe void Compress(Vector256<byte> data, uint mask, byte* output
227223
}
228224

229225
// Perform compression on the lower 128 bits
230-
Compress(data.GetLower().AsByte(), (ushort)mask, output);
226+
Compress(data.GetLower().AsByte(), (ushort)mask, output, tablePtr);
231227

232228
// Perform compression on the upper 128 bits, shifting the output pointer by the number of 0 bits in the lower 16 bits of mask (i.e. the popcount of ~mask & 0xFFFF)
233229
int popCount = (int)Popcnt.PopCount(~mask & 0xFFFF);
234-
Compress(Avx2.ExtractVector128(data.AsByte(), 1), (ushort)(mask >> 16), output + popCount);
230+
Compress(Avx2.ExtractVector128(data.AsByte(), 1), (ushort)(mask >> 16), output + popCount, tablePtr);
235231
}
236232

237233
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -331,12 +327,13 @@ private unsafe static OperationStatus InnerDecodeFromBase64AVX2Regular(ReadOnlyS
331327
bytesWritten = 0;
332328
const int blocksSize = 6;
333329
// Should be
334-
//Span<byte> buffer = stackalloc byte[blocksSize * 64];
330+
// Span<byte> buffer = stackalloc byte[blocksSize * 64];
335331
Span<byte> buffer = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
336332
// Define pointers within the fixed blocks
337333
fixed (byte* srcInit = source)
338334
fixed (byte* dstInit = dest)
339335
fixed (byte* startOfBuffer = buffer)
336+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
340337
{
341338
byte* srcEnd = srcInit + source.Length;
342339
byte* src = srcInit;
@@ -420,7 +417,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64AVX2Regular(ReadOnlyS
420417
// optimization opportunity: check for simple masks like those made of
421418
// continuous 1s followed by continuous 0s. And masks containing a
422419
// single bad character.
423-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
420+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
424421
bufferPtr += compressedBytesCount;
425422
bufferBytesConsumed += compressedBytesCount;
426423

@@ -668,6 +665,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64AVX2Url(ReadOnlySpan<
668665
fixed (byte* srcInit = source)
669666
fixed (byte* dstInit = dest)
670667
fixed (byte* startOfBuffer = buffer)
668+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
671669
{
672670
byte* srcEnd = srcInit + source.Length;
673671
byte* src = srcInit;
@@ -750,7 +748,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64AVX2Url(ReadOnlySpan<
750748
// optimization opportunity: check for simple masks like those made of
751749
// continuous 1s followed by continuous 0s. And masks containing a
752750
// single bad character.
753-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
751+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
754752
bufferPtr += compressedBytesCount;
755753
bufferBytesConsumed += compressedBytesCount;
756754

src/Base64SSEUTF16.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64SSERegular(ReadOnlySp
3636
fixed (char* srcInit = source)
3737
fixed (byte* dstInit = dest)
3838
fixed (byte* startOfBuffer = buffer)
39+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
3940
{
4041
char* srcEnd = srcInit + source.Length;
4142
char* src = srcInit;
@@ -118,7 +119,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64SSERegular(ReadOnlySp
118119
// optimization opportunity: check for simple masks like those made of
119120
// continuous 1s followed by continuous 0s. And masks containing a
120121
// single bad character.
121-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
122+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
122123
bufferPtr += compressedBytesCount;
123124
bufferBytesConsumed += compressedBytesCount;
124125

@@ -397,6 +398,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64SSEUrl(ReadOnlySpan<c
397398
fixed (char* srcInit = source)
398399
fixed (byte* dstInit = dest)
399400
fixed (byte* startOfBuffer = buffer)
401+
fixed (byte* tablePtr = Tables.pshufbCombineTable)
400402
{
401403
char* srcEnd = srcInit + source.Length;
402404
char* src = srcInit;
@@ -479,7 +481,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64SSEUrl(ReadOnlySpan<c
479481
// optimization opportunity: check for simple masks like those made of
480482
// continuous 1s followed by continuous 0s. And masks containing a
481483
// single bad character.
482-
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr);
484+
ulong compressedBytesCount = CompressBlock(ref b, badCharMask, bufferPtr, tablePtr);
483485
bufferPtr += compressedBytesCount;
484486
bufferBytesConsumed += compressedBytesCount;
485487

0 commit comments

Comments
 (0)