Skip to content

Commit 06fea92

Browse files
committed
Saving...
1 parent 0e8e90e commit 06fea92

File tree

9 files changed

+311
-60
lines changed

9 files changed

+311
-60
lines changed

README.md

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,51 @@
11
# SimdUnicode
22

3-
This is a fast C# library to process unicode strings.
3+
This is a fast C# library to process Unicode strings.
4+
5+
*It is currently not meant to be usable.*
6+
7+
Our goal is to provide faster methods than
8+
https://learn.microsoft.com/en-us/dotnet/api/system.text.encoding?view=net-7.0
9+
10+
## Requirements
11+
12+
We recommend you install .NET 7: https://dotnet.microsoft.com/en-us/download/dotnet/7.0
13+
14+
15+
## Running tests
16+
17+
```
18+
cd test
19+
dotnet test
20+
```
21+
22+
## Running Benchmarks
23+
24+
```
25+
cd benchmark
26+
dotnet run -c Release
27+
```
28+
29+
If you are on macOS or Linux, you may want to run the benchmarks in privileged mode:
30+
31+
```
32+
cd benchmark
33+
sudo dotnet run -c Release
34+
```
35+
36+
37+
## Building the library
38+
39+
```
40+
cd src
41+
dotnet build
42+
```
43+
44+
## Code format
45+
46+
We recommend you use `dotnet format`. E.g.,
47+
48+
```
49+
cd test
50+
dotnet format
51+
```

benchmark/Benchmark.cs

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
using System;
2+
using SimdUnicode;
3+
using BenchmarkDotNet.Attributes;
4+
using BenchmarkDotNet.Running;
5+
using System.Text;
6+
using System.Runtime;
7+
using System.Runtime.InteropServices;
8+
namespace SimdUnicodeBenchmarks
9+
{
10+
/// <summary>
/// BenchmarkDotNet benchmark class comparing three ways of checking whether
/// a buffer of UTF-16 characters contains only ASCII characters.
/// </summary>
public class Checker
{
    // Initialized eagerly so the fields are never null; Setup() replaces
    // them before the benchmarks run.
    private List<char[]> names = new List<char[]>();
    private List<bool> results = new List<bool>();

    /// <summary>
    /// Generates a pseudo-random character array of length <paramref name="n"/>.
    /// </summary>
    /// <param name="n">Number of characters to generate.</param>
    /// <param name="seed">
    /// Seed of the pseudo-random generator. The same (n, seed) pair always
    /// produces the same array; the default reproduces the historical output.
    /// </param>
    /// <returns>A newly allocated array of <paramref name="n"/> characters.</returns>
    /// <remarks>
    /// The alphabet contains 'é' (U+00E9), which is outside the ASCII range,
    /// so the result is not guaranteed to be pure ASCII despite the method name.
    /// </remarks>
    public static char[] GetRandomASCIIString(uint n, int seed = 12345)
    {
        var allowedChars = "abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNOPQRSTUVWXYZ01234567é89";

        var chars = new char[n];
        var rd = new Random(seed); // fixed seed keeps the benchmark input reproducible

        for (var i = 0; i < n; i++)
        {
            chars[i] = allowedChars[rd.Next(0, allowedChars.Length)];
        }

        return chars;
    }

    /// <summary>Length of each generated input string.</summary>
    [Params(100, 200, 500)]
    public uint N;

    /// <summary>
    /// Builds 100 input strings of length <see cref="N"/> and a matching result list.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        names = new List<char[]>();
        results = new List<bool>();

        for (int i = 0; i < 100; i++)
        {
            // A distinct seed per string: re-using one seed would generate
            // 100 identical inputs. The first string keeps the seed 12345.
            names.Add(GetRandomASCIIString(N, 12345 + i));
            results.Add(false);
        }
    }

    /// <summary>Benchmarks the SIMD implementation from this library.</summary>
    [Benchmark]
    public void FastUnicodeIsAscii()
    {
        for (int i = 0; i < names.Count; i++)
        {
            results[i] = SimdUnicode.Ascii.SIMDIsAscii(names[i]);
        }
    }

    /// <summary>Benchmarks the scalar implementation from this library.</summary>
    [Benchmark]
    public void StandardUnicodeIsAscii()
    {
        for (int i = 0; i < names.Count; i++)
        {
            results[i] = SimdUnicode.Ascii.IsAscii(names[i]);
        }
    }

    /// <summary>Benchmarks a check built on <see cref="Encoding.ASCII"/>.</summary>
    /// <remarks>
    /// NOTE(review): Encoding.ASCII presumably substitutes one replacement byte
    /// per non-ASCII character, in which case the byte count always equals the
    /// character count and this never detects non-ASCII input. Confirm
    /// before relying on its result.
    /// </remarks>
    [Benchmark]
    public void RuntimeIsAscii()
    {
        for (int i = 0; i < names.Count; i++)
        {
            char[] name = names[i];
            results[i] = (Encoding.ASCII.GetByteCount(name) == name.Length);
        }
    }
}
81+
82+
/// <summary>Entry point of the benchmark executable.</summary>
public class Program
{
    /// <summary>
    /// Reports the detected processor architecture, runs the <see cref="Checker"/>
    /// benchmarks, and prints a closing remark about the runtime-based benchmark.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    public static void Main(string[] args)
    {
        string detected = RuntimeInformation.ProcessArchitecture switch
        {
            Architecture.Arm64 => "ARM64 system detected.",
            Architecture.X64 => "X64 system detected (Intel, AMD,...).",
            _ => "Unrecognized system.",
        };
        Console.WriteLine(detected);

        // The returned summary is not inspected; BenchmarkDotNet reports on its own.
        _ = BenchmarkRunner.Run<Checker>();

        Console.WriteLine("The RuntimeIsAscii is too fast. The execution time does not depend on the string length.");
        Console.WriteLine("It is assuredly cheating..");
    }
}
106+
}

benchmark/benchmark.csproj

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>net7.0</TargetFramework>
6+
<ImplicitUsings>enable</ImplicitUsings>
7+
<Nullable>enable</Nullable>
8+
</PropertyGroup>
9+
10+
<ItemGroup>
11+
<PackageReference Include="BenchmarkDotNet" Version="0.13.5" />
12+
</ItemGroup>
13+
14+
<ItemGroup>
15+
<ProjectReference Include="..\src\SimdUnicode.csproj" />
16+
</ItemGroup>
17+
18+
</Project>

info/ORGANIZATION.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
We follow the recommended file and directory layout from
2-
https://learn.microsoft.com/en-us/dotnet/core/tutorials/testing-with-cli
2+
https://learn.microsoft.com/en-us/dotnet/core/tutorials/testing-with-cli
3+
4+
For benchmarking, we follow the instructions from
5+
https://benchmarkdotnet.org/articles/guides/getting-started.html

src/Ascii.cs

Lines changed: 115 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,131 @@
11
using System;
2+
using System.Runtime.Intrinsics;
3+
using System.Runtime.Intrinsics.X86;
4+
using System.Runtime.Intrinsics.Arm;
5+
using System.Runtime.CompilerServices;
6+
using System.Runtime.InteropServices;
27

3-
namespace SimdUnicode {
4-
public static class Ascii {
8+
9+
// Ideally, we would want to implement something that looks like
10+
// https://learn.microsoft.com/en-us/dotnet/api/system.text.asciiencoding?view=net-7.0
11+
//
12+
// See https://github.com/dotnet/runtime/blob/main/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.cs
13+
//
14+
// See https://github.com/dotnet/runtime/blob/main/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Transcoding.cs
15+
namespace SimdUnicode
16+
{
17+
public unsafe static class Ascii
18+
{
19+
/// <summary>Tells whether a single UTF-16 code unit is in the ASCII range.</summary>
/// <param name="c">The character to test.</param>
/// <returns><c>true</c> when <paramref name="c"/> is at most 0x7F.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsAscii(this char c)
{
    return c <= 0x7F;
}
6-
public static bool IsAscii(this string s) {
7-
foreach (var c in s) {
21+
22+
/// <summary>Tells whether every character of a string is in the ASCII range.</summary>
/// <param name="s">The string to scan.</param>
/// <returns>
/// <c>true</c> when no character is 128 or above (also for the empty string);
/// <c>false</c> as soon as one such character is found.
/// </returns>
public static bool IsAscii(this string s)
{
    for (int index = 0; index < s.Length; index++)
    {
        if (s[index] >= 128)
        {
            return false;
        }
    }
    return true;
}
12-
public static bool IsAscii(this ReadOnlySpan<char> s) {
13-
foreach (var c in s) {
30+
/// <summary>Tells whether every character of a span is in the ASCII range.</summary>
/// <param name="s">The characters to scan.</param>
/// <returns>
/// <c>true</c> when no character exceeds 127 (also for an empty span);
/// <c>false</c> as soon as one such character is found.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsAscii(this ReadOnlySpan<char> s)
{
    int index = 0;
    while (index < s.Length)
    {
        if (s[index] > 127)
        {
            return false;
        }
        index++;
    }
    return true;
}
18-
public static bool IsAscii(this Span<char> s) {
19-
foreach (var c in s) {
39+
/// <summary>
/// Tells whether every character of a span is in the ASCII range, using
/// 128-bit SIMD instructions when the processor supports them.
/// </summary>
/// <param name="s">The characters to scan.</param>
/// <returns>
/// <c>true</c> when all characters are below 128 (also for an empty span);
/// <c>false</c> otherwise.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
{
    if (s.IsEmpty)
    {
        return true;
    }

    // A 128-bit vector holds eight 16-bit UTF-16 code units.
    const int LanesPerVector = 8;

    // The ARM path uses AdvSimd.Arm64 intrinsics, so that is the class
    // whose support flag must be checked.
    if (AdvSimd.Arm64.IsSupported)
    {
        // We OR together all the vectors and then use the maximum lane
        // value to determine whether any character exceeds the ASCII
        // range. See
        // https://github.com/simdutf/simdutf/blob/master/src/arm64/implementation.cpp
        //
        // There is not a lot of documentation, but we can read the code at
        // https://github.com/dotnet/runtime/tree/main/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Arm
        // and see examples at
        // https://github.com/dotnet/runtime/blob/main/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/OptimizedInboxTextEncoder.AdvSimd64.cs
        fixed (char* pStart = &MemoryMarshal.GetReference(s))
        {
            ushort* p = (ushort*)pStart;
            int i = 0;
            if (s.Length >= LanesPerVector)
            {
                // Seeding the accumulator with the first load saves one OR;
                // Vector128<ushort>.Zero would work as well.
                Vector128<ushort> total = AdvSimd.LoadVector128(p);
                i = LanesPerVector;
                // unrolling could be useful here:
                for (; i + LanesPerVector <= s.Length; i += LanesPerVector)
                {
                    total = AdvSimd.Or(total, AdvSimd.LoadVector128(p + i));
                }

                if (AdvSimd.Arm64.MaxAcross(total).ToScalar() >= 128)
                {
                    return false;
                }
            }

            // Scalar tail: fewer than eight characters remain.
            for (; i < s.Length; i++)
            {
                if (p[i] >= 128)
                {
                    return false;
                }
            }
            return true;
        }
    }
    else if (Sse41.IsSupported)
    {
        fixed (char* pStart = &MemoryMarshal.GetReference(s))
        {
            ushort* p = (ushort*)pStart;
            int i = 0;
            if (s.Length >= LanesPerVector)
            {
                Vector128<ushort> total = Sse41.LoadDquVector128(p);
                i = LanesPerVector;
                // unrolling could be useful here:
                for (; i + LanesPerVector <= s.Length; i += LanesPerVector)
                {
                    total = Sse2.Or(total, Sse41.LoadDquVector128(p + i));
                }

                // A character is ASCII exactly when its upper nine bits are
                // zero. TestZ returns true when (total AND mask) is all zero.
                // The previous code compared a 16-bit MoveMask result with
                // 0xfffff, which could never match.
                Vector128<ushort> nonAsciiBits = Vector128.Create((ushort)0xFF80);
                if (!Sse41.TestZ(total, nonAsciiBits))
                {
                    return false;
                }
            }

            // Scalar tail: fewer than eight characters remain.
            for (; i < s.Length; i++)
            {
                if (p[i] >= 128)
                {
                    return false;
                }
            }
            return true;
        }
    }

    // Fallback code for processors without the SIMD support used above.
    foreach (var c in s)
    {
        if (c >= 128)
        {
            return false;
        }
    }
    return true;
}
24-
public static bool IsAscii(this ReadOnlyMemory<char> s) => IsAscii(s.Span);
25-
public static bool IsAscii(this Memory<char> s) => IsAscii(s.Span);
26127
}
27-
}
128+
}
129+
// Further reading:
130+
// https://github.com/dotnet/runtime/blob/main/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs
131+

src/SimdUnicode.csproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
<OutputType>Library</OutputType>
55
<TargetFramework>net7.0</TargetFramework>
66
<Nullable>enable</Nullable>
7+
<!-- Unsafe code must be enabled because the SIMD routines use pointers. See https://stackoverflow.com/questions/50636693/how-to-run-unsafe-code-in-visual-studio-code -->
8+
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
79
</PropertyGroup>
810

911
</Project>

src/UTF16.cs

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,9 @@
11
using System;
22

3-
namespace SimdUnicode {
4-
public static class UTF16 {
5-
public static bool IsUTF16(this char c) => c < 128;
6-
public static bool IsUTF16(this string s) {
7-
foreach (var c in s) {
8-
if (!c.IsUTF16()) return false;
9-
}
10-
return true;
11-
}
12-
public static bool IsUTF16(this ReadOnlySpan<char> s) {
13-
foreach (var c in s) {
14-
if (!c.IsUTF16()) return false;
15-
}
16-
return true;
17-
}
18-
public static bool IsUTF16(this Span<char> s) {
19-
foreach (var c in s) {
20-
if (!c.IsUTF16()) return false;
21-
}
22-
return true;
23-
}
24-
public static bool IsUTF16(this ReadOnlyMemory<char> s) => IsUTF16(s.Span);
25-
public static bool IsUTF16(this Memory<char> s) => IsUTF16(s.Span);
3+
// This may not be needed? Placeholder
4+
namespace SimdUnicode
5+
{
6+
/// <summary>
/// Placeholder type: it currently declares no members and may turn out
/// to be unnecessary.
/// </summary>
public static class UTF16
7+
{
268
}
279
}

0 commit comments

Comments
 (0)