Skip to content

Commit 29d6bac

Browse files
author
Daniel Lemire
committed
porting to AVX2
1 parent 4992306 commit 29d6bac

File tree

5 files changed

+130
-13
lines changed

5 files changed

+130
-13
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ fully reproducible.
3434
|:----------------|:------------------------|:-------------------|:-------------------|
3535
| Apple M2 processor (ARM, 3.5 GHz) | 10 | 3.8 | 2.6 x |
3636
| AWS Graviton 3 (ARM, 2.6 GHz) | 5.1 | 2.0 | 2.6 x |
37-
| Intel Ice Lake (2.0 GHz) | 6.5 | 3.4 | 1.9 x |
37+
| Intel Ice Lake (2.0 GHz) | 7.6 | 3.4 | 2.2 x |
3838
| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |
3939

4040
## Results (SimdBase64 vs. string .NET functions)

src/Base64.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<byte> source,
3636
//if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
3737
//{
3838
//}
39-
if (Avx2.IsSupported)
39+
if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
4040
{
4141
return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
4242
}
@@ -61,7 +61,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<char> source,
6161
//{
6262
// return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
6363
//}
64-
if (Avx2.IsSupported)
64+
if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
6565
{
6666
return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
6767
}

src/Base64ARM.cs

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -219,15 +219,12 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
219219
[MethodImpl(MethodImplOptions.AggressiveInlining)]
220220
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
221221
{
222-
223222
// if mask is a power of 2, we can use a simpler version
224223
if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
225224
{
226225
int pos64 = ArmBase.Arm64.LeadingZeroCount(mask);
227226
int pos = pos64 & 0xf;
228227
Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
229-
230-
231228
Vector128<byte> v0 = Vector128.Create((byte)(0xe - pos));
232229
switch (pos64 >> 4)
233230
{
@@ -240,7 +237,6 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
240237
Vector128.Store(b.chunk1, output + 1 * 16 - 1);
241238
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
242239
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
243-
244240
}
245241
break;
246242

@@ -253,7 +249,6 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
253249
Vector128.Store(compressed, output + 1 * 16);
254250
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
255251
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
256-
257252
}
258253
break;
259254

@@ -266,26 +261,22 @@ private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* outpu
266261
Vector128.Store(b.chunk1, output + 1 * 16);
267262
Vector128.Store(compressed, output + 2 * 16);
268263
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
269-
270264
}
271265
break;
272266

273267
case 0:
274268
{
275269
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
276270
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
277-
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
271+
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk3, sh);
278272
Vector128.Store(b.chunk0, output + 0 * 16);
279273
Vector128.Store(b.chunk1, output + 1 * 16);
280274
Vector128.Store(b.chunk2, output + 2 * 16);
281275
Vector128.Store(compressed, output + 3 * 16);
282276
}
283277
break;
284278
}
285-
286-
287279
return 63;
288-
289280
}
290281
ulong nmask = ~mask;
291282
Compress(b.chunk0, (ushort)mask, output, tablePtr);

src/Base64AVX2UTF8.cs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,69 @@ private static UInt64 ToBase64Mask(bool base64Url, ref Vector256<byte> src, ref
167167
[MethodImpl(MethodImplOptions.AggressiveInlining)]
168168
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
169169
{
170+
// if mask is a power of 2, we can use a simpler version
171+
if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
172+
{
173+
ulong pos64 = Bmi1.X64.TrailingZeroCount(mask);
174+
ulong pos = pos64 & 0xf;
175+
Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
176+
Vector128<byte> v0 = Vector128.Create((byte)(pos-1));
177+
switch (pos64 >> 4)
178+
{
179+
case 0:
180+
{
181+
Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk0, 0);
182+
Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
183+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
184+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
185+
Vector128<byte> compressed = Ssse3.Shuffle(chunk0, sh);
186+
Vector128.Store(compressed, output + 0 * 16);
187+
Vector128.Store(chunk1, output + 1 * 16 - 1);
188+
Vector256.Store(b.chunk1, output + 2 * 16 - 1);
189+
}
190+
break;
191+
192+
case 1:
193+
{
194+
Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk0, 0);
195+
Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
196+
Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
197+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
198+
Vector128<byte> compressed = Ssse3.Shuffle(chunk1, sh);
199+
Vector128.Store(chunk0, output + 0 * 16);
200+
Vector128.Store(compressed, output + 1 * 16);
201+
Vector256.Store(b.chunk1, output + 2 * 16 - 1);
202+
}
203+
break;
204+
205+
case 2:
206+
{
207+
Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk1, 0);
208+
Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
209+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
210+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
211+
Vector128<byte> compressed = Ssse3.Shuffle(chunk0, sh);
212+
Vector256.Store(b.chunk0, output + 0 * 16);
213+
Vector128.Store(compressed, output + 2 * 16);
214+
Vector128.Store(chunk1, output + 3 * 16 - 1);
215+
}
216+
break;
217+
218+
case 3:
219+
{
220+
Vector128<byte> chunk0 = Avx2.ExtractVector128(b.chunk1, 0);
221+
Vector128<byte> chunk1 = Avx2.ExtractVector128(b.chunk0, 1);
222+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
223+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
224+
Vector128<byte> compressed = Ssse3.Shuffle(chunk1, sh);
225+
Vector256.Store(b.chunk0, output + 0 * 16);
226+
Vector128.Store(chunk0, output + 2 * 16);
227+
Vector128.Store(compressed, output + 3 * 16);
228+
}
229+
break;
230+
}
231+
return 63;
232+
}
170233
ulong nmask = ~mask;
171234
Compress(b.chunk0, (UInt32)mask, output, tablePtr);
172235
Compress(b.chunk1, (UInt32)(mask >> 32), output + Popcnt.X64.PopCount(nmask & 0xFFFFFFFF), tablePtr);

src/Base64SSEUTF8.cs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System;
2+
using System.Numerics;
23
using System.Runtime.Intrinsics;
34
using System.Runtime.Intrinsics.X86;
45
using System.Runtime.CompilerServices;
@@ -131,6 +132,68 @@ private static ushort ToBase64Mask(bool base64Url, ref Vector128<byte> src, ref
131132
[MethodImpl(MethodImplOptions.AggressiveInlining)]
132133
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
133134
{
135+
// if mask is a power of 2, we can use a simpler version
136+
if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
137+
{
138+
int pos64 = BitOperations.TrailingZeroCount(mask);
139+
int pos = pos64 & 0xf;
140+
Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
141+
Vector128<byte> v0 = Vector128.Create((byte)(pos-1));
142+
switch (pos64 >> 4)
143+
{
144+
case 0:
145+
{
146+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
147+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
148+
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk0, sh);
149+
Vector128.Store(compressed, output + 0 * 16);
150+
Vector128.Store(b.chunk1, output + 1 * 16 - 1);
151+
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
152+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
153+
154+
}
155+
break;
156+
157+
case 1:
158+
{
159+
Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
160+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
161+
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk1, sh);
162+
Vector128.Store(b.chunk0, output + 0 * 16);
163+
Vector128.Store(compressed, output + 1 * 16);
164+
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
165+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
166+
167+
}
168+
break;
169+
170+
case 2:
171+
{
172+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
173+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
174+
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk2, sh);
175+
Vector128.Store(b.chunk0, output + 0 * 16);
176+
Vector128.Store(b.chunk1, output + 1 * 16);
177+
Vector128.Store(compressed, output + 2 * 16);
178+
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
179+
180+
}
181+
break;
182+
183+
case 3:
184+
{
185+
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
186+
Vector128<byte> sh = Sse2.Subtract(v1, v2);
187+
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk3, sh);
188+
Vector128.Store(b.chunk0, output + 0 * 16);
189+
Vector128.Store(b.chunk1, output + 1 * 16);
190+
Vector128.Store(b.chunk2, output + 2 * 16);
191+
Vector128.Store(compressed, output + 3 * 16);
192+
}
193+
break;
194+
}
195+
return 63;
196+
}
134197
ulong nmask = ~mask;
135198
Compress(b.chunk0, (ushort)mask, output, tablePtr);
136199
Compress(b.chunk1, (ushort)(mask >> 16), output + Popcnt.X64.PopCount(nmask & 0xFFFF), tablePtr);

0 commit comments

Comments
 (0)