Skip to content

Commit c37e685

Browse files
authored
Add a packed variant of single-value SearchValues<string> (#118108)
1 parent 97ec362 commit c37e685

File tree

7 files changed

+529
-35
lines changed

7 files changed

+529
-35
lines changed

src/libraries/System.Memory/tests/Span/StringSearchValues.cs

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,11 @@ void TestWithPoisonPages(PoisonPagePlacement poisonPlacement, int haystackLength
232232
.First(c => !values.AsSpan().ContainsAny(c, char.ToLowerInvariant(c)));
233233
}
234234

235-
TestWithDifferentMarkerChars(haystack, '\0');
235+
if (!values.Contains('\0'))
236+
{
237+
TestWithDifferentMarkerChars(haystack, '\0');
238+
}
239+
236240
TestWithDifferentMarkerChars(haystack, '\u00FC');
237241
TestWithDifferentMarkerChars(haystack, asciiNumberNotInSet);
238242
TestWithDifferentMarkerChars(haystack, asciiLetterLowerNotInSet);
@@ -407,10 +411,26 @@ public static void SimpleIndexOfAnyValues(params string[] valuesArray)
407411
valuesArray[offset] = $"{original[0]}\u00F6{original.AsSpan(1)}";
408412
TestCore(valuesArray);
409413

414+
// Test non-ASCII values over 0xFF
415+
valuesArray[offset] = $"{original}\u2049";
416+
TestCore(valuesArray);
417+
418+
valuesArray[offset] = $"\u2049{original}";
419+
TestCore(valuesArray);
420+
421+
valuesArray[offset] = $"{original[0]}\u2049{original.AsSpan(1)}";
422+
TestCore(valuesArray);
423+
410424
// Test null chars in values
411425
valuesArray[offset] = $"{original[0]}\0{original.AsSpan(1)}";
412426
TestCore(valuesArray);
413427

428+
valuesArray[offset] = $"\0{original}";
429+
TestCore(valuesArray);
430+
431+
valuesArray[offset] = $"{original}\0";
432+
TestCore(valuesArray);
433+
414434
static void TestCore(string[] valuesArray)
415435
{
416436
Values_ImplementsSearchValuesBase(StringComparison.Ordinal, valuesArray);
@@ -529,7 +549,7 @@ public static void TestIndexOfAny_RandomInputs_Stress()
529549
if (RemoteExecutor.IsSupported && Avx512F.IsSupported)
530550
{
531551
var psi = new ProcessStartInfo();
532-
psi.Environment.Add("DOTNET_EnableAVX512F", "0");
552+
psi.Environment.Add("DOTNET_EnableAVX512", "0");
533553
RemoteExecutor.Invoke(RunStress, new RemoteInvokeOptions { StartInfo = psi, TimeOut = 10 * 60 * 1000 }).Dispose();
534554
}
535555

src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,7 @@
471471
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyNonBucketizedN3.cs" />
472472
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyBase.cs" />
473473
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\MultiStringIgnoreCaseSearchValuesFallback.cs" />
474+
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesPackedThreeChars.cs" />
474475
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesThreeChars.cs" />
475476
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesFallback.cs" />
476477
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValues.cs" />
@@ -2462,10 +2463,8 @@
24622463
<Compile Include="$(CommonPath)Interop\Unix\System.Native\Interop.MountPoints.cs">
24632464
<Link>Common\Interop\Unix\System.Native\Interop.MountPoints.cs</Link>
24642465
</Compile>
2465-
<Compile Include="$(CommonPath)Interop\Linux\procfs\Interop.ProcMountInfo.cs"
2466-
Link="Common\Interop\Linux\procfs\Interop.ProcMountInfo.cs" />
2467-
<Compile Include="$(CommonPath)Interop\Linux\procfs\Interop.ProcMountInfo.TryParseMountInfoLine.cs"
2468-
Link="Common\Interop\Linux\procfs\Interop.ProcMountInfo.TryParseMountInfoLine.cs" />
2466+
<Compile Include="$(CommonPath)Interop\Linux\procfs\Interop.ProcMountInfo.cs" Link="Common\Interop\Linux\procfs\Interop.ProcMountInfo.cs" />
2467+
<Compile Include="$(CommonPath)Interop\Linux\procfs\Interop.ProcMountInfo.TryParseMountInfoLine.cs" Link="Common\Interop\Linux\procfs\Interop.ProcMountInfo.TryParseMountInfoLine.cs" />
24692468
<Compile Include="$(CommonPath)Interop\Unix\System.Native\Interop.Open.cs">
24702469
<Link>Common\Interop\Unix\System.Native\Interop.Open.cs</Link>
24712470
</Compile>
@@ -2879,4 +2878,4 @@
28792878
<Compile Include="$(MSBuildThisFileDirectory)System\Threading\Wasi\WasiPollWorld.wit.imports.wasi.io.v0_2_0.IPoll.cs" />
28802879
<Compile Include="$(MSBuildThisFileDirectory)System\Threading\Wasi\WasiPollWorld.wit.imports.wasi.io.v0_2_0.PollInterop.cs" />
28812880
</ItemGroup>
2882-
</Project>
2881+
</Project>

src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/AsciiStringSearchValuesTeddyBase.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ namespace System.Buffers
9191
//
9292
// For an alternative description of the algorithm, see
9393
// https://github.com/BurntSushi/aho-corasick/blob/8d735471fc12f0ca570cead8e17342274fae6331/src/packed/teddy/README.md
94-
// Has an O(i * m) worst-case, with the expected time closer to O(n) for good bucket distributions.
94+
// Has an O(i * m) worst-case, with the expected time closer to O(i) for good bucket distributions.
9595
internal abstract class AsciiStringSearchValuesTeddyBase<TBucketized, TStartCaseSensitivity, TCaseSensitivity> : StringSearchValuesRabinKarp<TCaseSensitivity>
9696
where TBucketized : struct, SearchValues.IRuntimeConst
9797
where TStartCaseSensitivity : struct, ICaseSensitivity // Refers to the characters being matched by Teddy

src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/StringSearchValuesHelper.cs

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
using System.Runtime.CompilerServices;
77
using System.Runtime.InteropServices;
88
using System.Runtime.Intrinsics;
9+
using System.Runtime.Intrinsics.Arm;
910
using System.Text;
1011

1112
namespace System.Buffers
@@ -270,12 +271,31 @@ public static bool Equals<TValueLength>(ref char matchStart, ref readonly Single
270271
else
271272
{
272273
Debug.Assert(state.Value.Length is 2 or 3);
273-
Debug.Assert(matchStart == state.Value[0], "This should only be called after the first character has been checked");
274274

275-
// We know that the candidate is 2 or 3 characters long, and that the first character has already been checked.
276-
// We only have to to check whether the last 2 characters also match.
277275
ref byte matchByteStart = ref Unsafe.As<char, byte>(ref matchStart);
278-
return Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref matchByteStart, state.SecondReadByteOffset)) == state.Value32_1;
276+
277+
if (AdvSimd.IsSupported)
278+
{
279+
// See comments on SingleStringSearchValuesPackedThreeChars.CanSkipAnchorMatchVerification.
280+
// When running on Arm64, this helper is also used to confirm vectorized anchor matches.
281+
// We do so because we're using UnzipEven when packing inputs, which may produce false positive anchor matches.
282+
// When called from SingleStringSearchValuesThreeChars (non-packed), we could skip to the else branch instead.
283+
Debug.Assert(matchStart == state.Value[0] || (matchStart & 0xFF) == state.Value[0]);
284+
285+
uint differentBits = Unsafe.ReadUnaligned<uint>(ref matchByteStart) - state.Value32_0;
286+
differentBits |= Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref matchByteStart, state.SecondReadByteOffset)) - state.Value32_1;
287+
return differentBits == 0;
288+
}
289+
else
290+
{
291+
// Otherwise, this path is not used when confirming vectorized anchor matches.
292+
// It's only used as part of the scalar search loop, which always checks that the first character matches before calling this helper.
293+
// We know that the candidate is 2 or 3 characters long, and that the first character has already been checked.
294+
// We only have to check whether the last 2 characters also match.
295+
Debug.Assert(matchStart == state.Value[0], "This should only be called after the first character has been checked");
296+
297+
return Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref matchByteStart, state.SecondReadByteOffset)) == state.Value32_1;
298+
}
279299
}
280300
}
281301
}
@@ -319,13 +339,32 @@ public static bool Equals<TValueLength>(ref char matchStart, ref readonly Single
319339
else
320340
{
321341
Debug.Assert(state.Value.Length is 2 or 3);
322-
Debug.Assert(TransformInput(matchStart) == state.Value[0], "This should only be called after the first character has been checked");
323342

324-
// We know that the candidate is 2 or 3 characters long, and that the first character has already been checked.
325-
// We only have to to check whether the last 2 characters also match.
326343
const uint CaseMask = ~0x200020u;
327344
ref byte matchByteStart = ref Unsafe.As<char, byte>(ref matchStart);
328-
return (Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref matchByteStart, state.SecondReadByteOffset)) & CaseMask) == state.Value32_1;
345+
346+
if (AdvSimd.IsSupported)
347+
{
348+
// See comments on SingleStringSearchValuesPackedThreeChars.CanSkipAnchorMatchVerification.
349+
// When running on Arm64, this helper is also used to confirm vectorized anchor matches.
350+
// We do so because we're using UnzipEven when packing inputs, which may produce false positive anchor matches.
351+
// When called from SingleStringSearchValuesThreeChars (non-packed), we could skip to the else branch instead.
352+
Debug.Assert(TransformInput((char)(matchStart & 0xFF)) == state.Value[0]);
353+
354+
uint differentBits = (Unsafe.ReadUnaligned<uint>(ref matchByteStart) & CaseMask) - state.Value32_0;
355+
differentBits |= (Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref matchByteStart, state.SecondReadByteOffset)) & CaseMask) - state.Value32_1;
356+
return differentBits == 0;
357+
}
358+
else
359+
{
360+
// Otherwise, this path is not used when confirming vectorized anchor matches.
361+
// It's only used as part of the scalar search loop, which always checks that the first character matches before calling this helper.
362+
// We know that the candidate is 2 or 3 characters long, and that the first character has already been checked.
363+
// We only have to check whether the last 2 characters also match.
364+
Debug.Assert(TransformInput(matchStart) == state.Value[0], "This should only be called after the first character has been checked");
365+
366+
return (Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref matchByteStart, state.SecondReadByteOffset)) & CaseMask) == state.Value32_1;
367+
}
329368
}
330369
}
331370
}
@@ -392,7 +431,6 @@ public static bool Equals<TValueLength>(ref char matchStart, ref readonly Single
392431
else
393432
{
394433
Debug.Assert(state.Value.Length is 2 or 3);
395-
Debug.Assert((matchStart & ~0x20) == (state.Value[0] & ~0x20));
396434

397435
ref byte matchByteStart = ref Unsafe.As<char, byte>(ref matchStart);
398436
uint differentBits = (Unsafe.ReadUnaligned<uint>(ref matchByteStart) & state.ToUpperMask32_0) - state.Value32_0;

0 commit comments

Comments
 (0)