Skip to content

Commit edcb3ca

Browse files
committed
Saving...
1 parent 06fea92 commit edcb3ca

File tree

4 files changed

+67
-12
lines changed

4 files changed

+67
-12
lines changed

README.md

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,23 @@ This is a fast C# library to process unicode strings.
44

55
*It is currently not meant to be usable.*
66

7-
Our goal is to provide faster methods than
8-
https://learn.microsoft.com/en-us/dotnet/api/system.text.encoding?view=net-7.0
7+
## Motivation
8+
9+
The most important immediate goal would be to speed up the
10+
`Utf8Utility.GetPointerToFirstInvalidByte` function.
11+
12+
https://github.com/dotnet/runtime/blob/4d709cd12269fcbb3d0fccfb2515541944475954/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs
13+
14+
15+
(We may need to speed up `Ascii.GetIndexOfFirstNonAsciiByte` first.)
16+
17+
The question is whether we could do it using this routine:
18+
19+
* John Keiser, Daniel Lemire, [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), Software: Practice and Experience 51 (5), 2021
20+
21+
Our generic implementation is available here: https://github.com/simdutf/simdutf/blob/master/src/generic/utf8_validation/utf8_lookup4_algorithm.h
22+
23+
Porting it to C# is no joke, but doable.
924

1025
## Requirements
1126

@@ -48,4 +63,12 @@ We recommend you use `dotnet format`. E.g.,
4863
```
4964
cd test
5065
dotnet format
51-
```
66+
```
67+
68+
69+
## More reading
70+
71+
72+
https://github.com/dotnet/coreclr/pull/21948/files#diff-2a22774bd6bff8e217ecbb3a41afad033ce0ca0f33645e9d8f5bdf7c9e3ac248
73+
74+
https://github.com/dotnet/runtime/issues/41699

benchmark/Benchmark.cs

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,35 @@
55
using System.Text;
66
using System.Runtime;
77
using System.Runtime.InteropServices;
8+
using System.Buffers;
9+
810
namespace SimdUnicodeBenchmarks
911
{
12+
13+
// See https://github.com/dotnet/performance/blob/cea924dd0639057c1062444a642a470deef96158/src/benchmarks/micro/libraries/System.Text.Encoding/Perf.Ascii.cs#L38
14+
// for a standard benchmark
1015
public class Checker
1116
{
1217
List<char[]> names;
13-
List<bool> results;
18+
List<bool> results;
19+
public static bool RuntimeIsAsciiApproach(ReadOnlySpan<char> s)
20+
{
21+
// The runtime as of .NET 8.0 has a dedicated method for this, but
22+
// it is not available prior to that, so let us branch.
23+
#if NET8_0_OR_GREATER
24+
return Ascii.IsValid(s);
25+
#else
26+
foreach (char c in s)
27+
{
28+
if (c >= 128)
29+
{
30+
return false;
31+
}
32+
}
1433

34+
return true;
35+
#endif
36+
}
1537
public static char[] GetRandomASCIIString(uint n)
1638
{
1739
var allowedChars = "abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNOPQRSTUVWXYZ01234567é89";
@@ -72,7 +94,7 @@ public void RuntimeIsAscii()
7294
int count = 0;
7395
foreach (char[] name in names)
7496
{
75-
results[count] = (Encoding.ASCII.GetByteCount(name) == name.Length);
97+
results[count] = RuntimeIsAsciiApproach(name);
7698
count += 1;
7799
}
78100
}
@@ -98,9 +120,6 @@ public static void Main(string[] args)
98120

99121
}
100122
var summary = BenchmarkRunner.Run<Checker>();
101-
Console.WriteLine("The RuntimeIsAscii is too fast. The execution time does not depend on the string length.");
102-
Console.WriteLine("It is assuredly cheating..");
103-
104123
}
105124
}
106125
}

info/FUTUREWORK.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
## Motivation (later)
2+
3+
We would like to write faster versions of the `TranscodeToUtf8` and `TranscodeToUtf16` functions. Probably,
4+
the most difficult and beneficial would be `TranscodeToUtf16`.
5+
6+
7+
https://github.com/dotnet/runtime/blob/4d709cd12269fcbb3d0fccfb2515541944475954/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs#L838
8+
9+
10+
11+
Our goal is to provide faster methods than
12+
https://learn.microsoft.com/en-us/dotnet/api/system.text.encoding?view=net-7.0

test/AsciiTest.cs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
namespace tests;
2+
using System.Text;
23

34
public class AsciiTest
45
{
56
[Fact]
67
public void Test1()
78
{
8-
Assert.True(SimdUnicode.Ascii.IsAscii("absads"));
9-
Assert.False(SimdUnicode.Ascii.IsAscii("absaé"));
10-
Assert.True(SimdUnicode.Ascii.SIMDIsAscii("absads"));
11-
Assert.False(SimdUnicode.Ascii.SIMDIsAscii("absaé"));
9+
Assert.True(SimdUnicode.Ascii.IsAscii("absads12323123232131231232132132132312321321312321"));
10+
Assert.False(SimdUnicode.Ascii.IsAscii("absaé12323123232131231232132132132312321321312321"));
11+
Assert.True(SimdUnicode.Ascii.SIMDIsAscii("absads12323123232131231232132132132312321321312321"));
12+
Assert.False(SimdUnicode.Ascii.SIMDIsAscii("absaé12323123232131231232132132132312321321312321"));
1213
}
1314
}

0 commit comments

Comments
 (0)