Skip to content

Commit f844b88

Browse files
authored
Merge pull request #2 from simdutf/adding_ASCII_tests
Adding ASCII tests
2 parents 19b845d + 6f0735a commit f844b88

File tree

2 files changed

+88
-7
lines changed

2 files changed

+88
-7
lines changed

src/Ascii.cs

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
using System.Runtime.InteropServices;
77

88

9+
10+
911
// Ideally, we would want to implement something that looks like
1012
// https://learn.microsoft.com/en-us/dotnet/api/system.text.asciiencoding?view=net-7.0
1113
//
@@ -63,7 +65,7 @@ public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
6365
{
6466
// instead of a load, we could have set it to zero, like so...
6567
// total = Vector128<ushort>.Zero;
66-
// or to a custome value like this:
68+
// or to a custom value like this:
6769
// total = DuplicateToVector128((char)0);
6870
Vector128<ushort> total = AdvSimd.LoadVector128((ushort*)pStart);
6971
i += 8;
@@ -90,21 +92,26 @@ public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
9092
fixed (char* pStart = &MemoryMarshal.GetReference(s))
9193
{
9294
int i = 0;
93-
if (s.Length > 8)
95+
if (s.Length > 16) // Adjusted for the unrolled loop
9496
{
9597
Vector128<ushort> total = Sse41.LoadDquVector128((ushort*)pStart);
9698
i += 8;
97-
// unrolling could be useful here:
98-
for (; i + 7 < s.Length; i += 8)
99+
100+
// Unrolling the loop by 2x
101+
for (; i + 15 < s.Length; i += 16)
99102
{
100-
Vector128<ushort> raw = Sse41.LoadDquVector128((ushort*)pStart + i);
101-
total = Sse2.Or(total, raw);
103+
Vector128<ushort> raw1 = Sse41.LoadDquVector128((ushort*)pStart + i);
104+
Vector128<ushort> raw2 = Sse41.LoadDquVector128((ushort*)pStart + i + 8);
105+
106+
total = Sse2.Or(total, raw1);
107+
total = Sse2.Or(total, raw2);
102108
}
109+
103110
Vector128<ushort> b127 = Vector128.Create((ushort)127);
104111
Vector128<ushort> b = Sse41.Max(b127, total);
105112
Vector128<ushort> b16 = Sse41.CompareEqual(b, b127);
106113
int movemask = Sse2.MoveMask(b16.AsByte());
107-
if (movemask != 0xfffff)
114+
if (movemask != 0xffff)
108115
{
109116
return false;
110117
}

test/AsciiTest.cs

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,80 @@ public void Test1()
99
Assert.True(SimdUnicode.Ascii.IsAscii("absads12323123232131231232132132132312321321312321"));
1010
Assert.False(SimdUnicode.Ascii.IsAscii("absaé12323123232131231232132132132312321321312321"));
1111
Assert.True(SimdUnicode.Ascii.SIMDIsAscii("absads12323123232131231232132132132312321321312321"));
12+
Assert.True(SimdUnicode.Ascii.SIMDIsAscii("12345678"));
13+
Assert.True(SimdUnicode.Ascii.SIMDIsAscii("123456789"));
14+
Assert.True(SimdUnicode.Ascii.SIMDIsAscii("1234567890123456"));
1215
Assert.False(SimdUnicode.Ascii.SIMDIsAscii("absaé12323123232131231232132132132312321321312321"));
16+
Assert.False(SimdUnicode.Ascii.SIMDIsAscii("absa12323123232131231232132132132312321321312321é"));
17+
}
18+
19+
[Fact]
20+
public void HardCodedSequencesTest()
21+
{
22+
string[] goodsequences = {
23+
"a",
24+
"abcde12345",
25+
"\x71",
26+
"\x75\x4c",
27+
"\x7f\x4c\x23\x3c\x3a\x6f\x5d\x44\x13\x70"
28+
};
29+
30+
string[] badsequences = {
31+
"\xc3\x28",
32+
"\xa0\xa1",
33+
"\xe2\x28\xa1",
34+
"\xe2\x82\x28",
35+
"\xf0\x28\x8c\xbc",
36+
"\xf0\x90\x28\xbc",
37+
"\xf0\x28\x8c\x28",
38+
"\xc0\x9f",
39+
"\xf5\xff\xff\xff",
40+
"\xed\xa0\x81",
41+
"\xf8\x90\x80\x80\x80",
42+
"123456789012345\xed",
43+
"123456789012345\xf1",
44+
"123456789012345\xc2",
45+
"\xC2\x7F",
46+
"\xce",
47+
"\xce\xba\xe1",
48+
"\xce\xba\xe1\xbd",
49+
"\xce\xba\xe1\xbd\xb9\xcf",
50+
"\xce\xba\xe1\xbd\xb9\xcf\x83\xce",
51+
"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce",
52+
"\xdf",
53+
"\xef\xbf",
54+
"\x80",
55+
"\x91\x85\x95\x9e",
56+
"\x6c\x02\x8e\x18",
57+
"\x25\x5b\x6e\x2c\x32\x2c\x5b\x5b\x33\x2c\x34\x2c\x05\x29\x2c\x33\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5d\x2c\x35\x2e\x33\x2c\x39\x2e\x33\x2c\x37\x2e\x33\x2c\x39\x2e\x34\x2c\x37\x2e\x33\x2c\x39\x2e\x33\x2c\x37\x2e\x33\x2c\x39\x2e\x34\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x20\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x23\x0a\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x7e\x7e\x0a\x0a\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5d\x2c\x37\x2e\x33\x2c\x39\x2e\x33\x2c\x37\x2e\x33\x2c\x39\x2e\x34\x2c\x37\x2e\x33\x2c\x39\x2e\x33\x2c\x37\x2e\x33\x2c\x39\x2e\x34\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x01\x01\x80\x01\x01\x01\x79\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01",
58+
"[[[[[[[[[[[[[[[\x80\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x010\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01",
59+
"\x20\x0b\x01\x01\x01\x64\x3a\x64\x3a\x64\x3a\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x30\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x80\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01",
60+
"\x80",
61+
"\x90",
62+
"\xa1",
63+
"\xb2",
64+
"\xc3",
65+
"\xd4",
66+
"\xe5",
67+
"\xf6",
68+
"\xc3\xb1",
69+
"\xe2\x82\xa1",
70+
"\xf0\x90\x8c\xbc",
71+
"\xc2\x80",
72+
"\xf0\x90\x80\x80",
73+
"\xee\x80\x80",
74+
"\xef\xbb\xbf"};
75+
76+
foreach (var sequence in goodsequences)
77+
{
78+
Assert.True(SimdUnicode.Ascii.IsAscii(sequence), "Expected valid ASCII sequence");
79+
Assert.True(SimdUnicode.Ascii.SIMDIsAscii(sequence), "Expected SIMDIsAscii to validate ASCII sequence");
80+
}
81+
82+
foreach (var sequence in badsequences)
83+
{
84+
Assert.False(SimdUnicode.Ascii.IsAscii(sequence), "Expected non-valid ASCII sequence");
85+
Assert.False(SimdUnicode.Ascii.SIMDIsAscii(sequence), "Expected SIMDIsAscii to invalidate non-ASCII sequence");
86+
}
1387
}
1488
}

0 commit comments

Comments
 (0)