 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 
 namespace Ramstack.Globbing.Internal;
@@ -172,14 +173,46 @@ static void ConvertPathToPosixStyleImpl(ref char p, nint length)
             }
             while (i < tail);
 
+            //
             // Process remaining chars
             // NOTE: An extra one write for the 'length == Vector128<ushort>.Count'
+            //
 
             value = LoadVector128(ref p, tail);
             mask = Sse2.CompareEqual(value, backslash);
             result = Sse41.BlendVariable(value, slash, mask);
             WriteVector128(ref p, tail, result);
         }
+        else if (AdvSimd.IsSupported && length >= Vector128<ushort>.Count)
+        {
+            Vector128<ushort> value;
+            Vector128<ushort> mask;
+            Vector128<ushort> result;
+
+            var slash = Vector128.Create((ushort)'/');
+            var backslash = Vector128.Create((ushort)'\\');
+            var tail = length - Vector128<ushort>.Count;
+
+            do
+            {
+                value = LoadVector128(ref p, i);
+                mask = AdvSimd.CompareEqual(value, backslash);
+                result = AdvSimd.BitwiseSelect(mask, slash, value);
+                WriteVector128(ref p, i, result);
+
+                i += Vector128<ushort>.Count;
+            }
+            while (i < tail);
+
+            //
+            // Process remaining chars
+            // NOTE: An extra one write for the 'length == Vector128<ushort>.Count'
+            //
+            value = LoadVector128(ref p, tail);
+            mask = AdvSimd.CompareEqual(value, backslash);
+            result = AdvSimd.BitwiseSelect(mask, slash, value);
+            WriteVector128(ref p, tail, result);
+        }
         else
         {
             for (; i < length; i++)
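The existing SSE branch and the new AdvSimd branch share the same shape: replace '\\' with '/' eight UTF-16 code units at a time, then reload one final vector from `length - Vector128<ushort>.Count`, which is why a single overlapping write can occur when the length is exactly one vector. The following is a minimal, self-contained sketch of that pattern using the portable Vector128 API instead of the repository's LoadVector128/WriteVector128 helpers and per-architecture intrinsics; the names PosixPathSketch and ConvertToPosix are illustrative, not part of the library.

using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

static class PosixPathSketch
{
    // Replaces '\' with '/' in place, one Vector128<ushort> (8 chars) at a time.
    // The last vector is loaded from 'length - 8', so the trailing characters are
    // handled with one overlapping (possibly duplicate) write, mirroring the
    // "extra one write" note in the diff.
    public static void ConvertToPosix(Span<char> path)
    {
        if (!Vector128.IsHardwareAccelerated || path.Length < Vector128<ushort>.Count)
        {
            // Scalar fallback for short inputs or non-accelerated targets.
            for (int j = 0; j < path.Length; j++)
                if (path[j] == '\\')
                    path[j] = '/';
            return;
        }

        ref var p = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(path));
        var slash = Vector128.Create((ushort)'/');
        var backslash = Vector128.Create((ushort)'\\');

        nuint i = 0;
        nuint tail = (nuint)path.Length - (nuint)Vector128<ushort>.Count;

        do
        {
            var value = Vector128.LoadUnsafe(ref p, i);
            var mask = Vector128.Equals(value, backslash);
            Vector128.ConditionalSelect(mask, slash, value).StoreUnsafe(ref p, i);
            i += (nuint)Vector128<ushort>.Count;
        }
        while (i < tail);

        // Overlapping tail: reload from 'tail' and write once more.
        var last = Vector128.LoadUnsafe(ref p, tail);
        var lastMask = Vector128.Equals(last, backslash);
        Vector128.ConditionalSelect(lastMask, slash, last).StoreUnsafe(ref p, tail);
    }
}

Here Vector128.ConditionalSelect plays the role that Sse41.BlendVariable and AdvSimd.BitwiseSelect play in the diff above.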
@@ -198,10 +231,10 @@ static void ConvertPathToPosixStyleImpl(ref char p, nint length)
     /// <returns>
     /// A 256-bit bitmask for escaping characters.
     /// </returns>
-    private static Vector256<ushort> CreateAllowEscaping256Bitmask(MatchFlags flags)
+    private static Vector256<ushort> CreateBackslash256Bitmask(MatchFlags flags)
     {
         var mask = Vector256<ushort>.Zero;
-        if (flags != MatchFlags.Windows)
+        if (flags == MatchFlags.Windows)
             mask = Vector256<ushort>.AllBitsSet;
 
         return mask;
@@ -214,10 +247,10 @@ private static Vector256<ushort> CreateAllowEscaping256Bitmask(MatchFlags flags)
     /// <returns>
     /// A 128-bit bitmask for escaping characters.
     /// </returns>
-    private static Vector128<ushort> CreateAllowEscaping128Bitmask(MatchFlags flags)
+    private static Vector128<ushort> CreateBackslash128Bitmask(MatchFlags flags)
     {
         var mask = Vector128<ushort>.Zero;
-        if (flags != MatchFlags.Windows)
+        if (flags == MatchFlags.Windows)
             mask = Vector128<ushort>.AllBitsSet;
 
         return mask;
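After the rename, these helpers return an all-ones vector only for MatchFlags.Windows, so callers can fold the flag check into a single And with the backslash comparison. A rough sketch of how such a mask gates the comparison is shown below; the MatchFlags enum here is a simplified stand-in for the library's real type, and FindSeparators is an invented name for illustration.

using System.Runtime.Intrinsics;

static class SeparatorMaskSketch
{
    // Simplified stand-in: in Windows-style matching '\' acts as a separator,
    // otherwise it acts as an escape character.
    public enum MatchFlags { Posix, Windows }

    // All-ones when '\' should be treated as a separator, all-zeros otherwise.
    static Vector128<ushort> CreateBackslashBitmask(MatchFlags flags) =>
        flags == MatchFlags.Windows ? Vector128<ushort>.AllBitsSet : Vector128<ushort>.Zero;

    // comparison = (chunk == '/') | (backslashMask & (chunk == '\\'))
    public static Vector128<ushort> FindSeparators(Vector128<ushort> chunk, MatchFlags flags)
    {
        var slash = Vector128.Create((ushort)'/');
        var backslash = Vector128.Create((ushort)'\\');
        var backslashMask = CreateBackslashBitmask(flags);

        return Vector128.BitwiseOr(
            Vector128.Equals(chunk, slash),
            Vector128.BitwiseAnd(backslashMask, Vector128.Equals(chunk, backslash)));
    }
}

The old form, AndNot(allowEscapingMask, ...) with the inverted flag test, computes the same result, since AndNot(a, b) is ~a & b; the rename simply makes the mask's meaning direct.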
@@ -301,15 +334,37 @@ public PathSegmentIterator() =>
 
         while ((int)_position < length)
         {
-            if ((Avx2.IsSupported || Sse2.IsSupported) && _mask != 0)
+            if ((Avx2.IsSupported || Sse2.IsSupported || AdvSimd.IsSupported) && _mask != 0)
             {
                 var offset = BitOperations.TrailingZeroCount(_mask);
-                _last = (int)(_position + (nint)((uint)offset >> 1));
+                if (AdvSimd.IsSupported)
+                {
+                    //
+                    // On ARM, ExtractMostSignificantBits returns a mask where each bit
+                    // represents one vector element (1 bit per ushort), so offset
+                    // directly corresponds to the element index
+                    //
+                    _last = (int)(_position + (nint)(uint)offset);
 
-                //
-                // Clear the bits for the current separator to process the next position in the mask
-                //
-                _mask &= ~(0b_11u << offset);
+                    //
+                    // Clear the bits for the current separator
+                    //
+                    _mask &= ~(1u << offset);
+                }
+                else
+                {
+                    //
+                    // On x86, MoveMask (and ExtractMostSignificantBits on byte-based vectors)
+                    // returns a mask where each bit represents one byte (2 bits per ushort),
+                    // so we need to divide offset by 2 to get the actual element index
+                    //
+                    _last = (int)(_position + (nint)((uint)offset >> 1));
+
+                    //
+                    // Clear the bits for the current separator
+                    //
+                    _mask &= ~(0b_11u << offset);
+                }
 
                 //
                 // Advance position to the next chunk when no separators remain in the mask
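The two branches above differ only in how a bit offset in the cached mask maps back to a character index. A small sketch of that arithmetic, independent of the iterator's fields, might look like this (the class and method names are illustrative):

using System.Numerics;

static class MaskOffsetSketch
{
    // x86 path: Sse2.MoveMask (and ExtractMostSignificantBits over byte lanes)
    // sets one bit per byte, so a matching ushort element produces two adjacent
    // bits and the element index is the bit offset divided by 2.
    // Example: a separator in element 3 sets bits 6 and 7;
    // TrailingZeroCount = 6, and 6 >> 1 = 3.
    public static int ElementIndexFromByteMask(uint mask) =>
        BitOperations.TrailingZeroCount(mask) >> 1;

    // Clears both bits of the hit just consumed, matching '_mask &= ~(0b_11u << offset)'.
    public static uint ClearByteMaskHit(uint mask, int offset) =>
        mask & ~(0b_11u << offset);

    // ARM path: the element-wise bitmask has one bit per ushort, so the bit
    // offset already is the element index.
    // Example: a separator in element 3 sets bit 3; TrailingZeroCount = 3.
    public static int ElementIndexFromElementMask(uint mask) =>
        BitOperations.TrailingZeroCount(mask);

    // Clears the single bit of the hit just consumed, matching '_mask &= ~(1u << offset)'.
    public static uint ClearElementMaskHit(uint mask, int offset) =>
        mask & ~(1u << offset);
}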
@@ -340,14 +395,14 @@ public PathSegmentIterator() =>
             if (Avx2.IsSupported && (int)_position + Vector256<ushort>.Count <= length)
             {
                 var chunk = LoadVector256(ref source, _position);
-                var allowEscapingMask = CreateAllowEscaping256Bitmask(flags);
+                var backslashMask = CreateBackslash256Bitmask(flags);
                 var slash = Vector256.Create((ushort)'/');
                 var backslash = Vector256.Create((ushort)'\\');
 
                 var comparison = Avx2.Or(
                     Avx2.CompareEqual(chunk, slash),
-                    Avx2.AndNot(
-                        allowEscapingMask,
+                    Avx2.And(
+                        backslashMask,
                         Avx2.CompareEqual(chunk, backslash)));
 
                 //
@@ -367,14 +422,14 @@ public PathSegmentIterator() =>
             else if (Sse2.IsSupported && !Avx2.IsSupported && (int)_position + Vector128<ushort>.Count <= length)
             {
                 var chunk = LoadVector128(ref source, _position);
-                var allowEscapingMask = CreateAllowEscaping128Bitmask(flags);
+                var backslashMask = CreateBackslash128Bitmask(flags);
                 var slash = Vector128.Create((ushort)'/');
                 var backslash = Vector128.Create((ushort)'\\');
 
                 var comparison = Sse2.Or(
                     Sse2.CompareEqual(chunk, slash),
-                    Sse2.AndNot(
-                        allowEscapingMask,
+                    Sse2.And(
+                        backslashMask,
                         Sse2.CompareEqual(chunk, backslash)));
 
                 //
@@ -391,6 +446,35 @@ public PathSegmentIterator() =>
                 if (_mask == 0)
                     _position += Vector128<ushort>.Count;
             }
+#if NET7_0_OR_GREATER
+            else if (AdvSimd.IsSupported && (int)_position + Vector128<ushort>.Count <= length)
+            {
+                var chunk = LoadVector128(ref source, _position);
+                var backslashMask = CreateBackslash128Bitmask(flags);
+                var slash = Vector128.Create((ushort)'/');
+                var backslash = Vector128.Create((ushort)'\\');
+
+                var comparison = AdvSimd.Or(
+                    AdvSimd.CompareEqual(chunk, slash),
+                    AdvSimd.And(
+                        backslashMask,
+                        AdvSimd.CompareEqual(chunk, backslash)));
+
+                //
+                // Store the comparison bitmask and reuse it across iterations
+                // as long as it contains non-zero bits.
+                // This avoids reloading SIMD registers and repeating comparisons
+                // on the same chunk of data.
+                //
+                _mask = comparison.ExtractMostSignificantBits();
+
+                //
+                // Advance position to the next chunk when no separators found
+                //
+                if (_mask == 0)
+                    _position += Vector128<ushort>.Count;
+            }
+#endif
             else
             {
                 for (; (int)_position < length; _position++)
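The new AdvSimd branch follows the same caching strategy as the x86 branches: one vector comparison produces a bitmask that is stored in `_mask` and drained bit by bit on later iterations before the next chunk is loaded. A standalone sketch of that extract-once, peel-bits-later pattern, using the portable API and an invented PrintSeparators helper (not the iterator's actual code), is shown below.

using System;
using System.Numerics;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

static class SegmentScanSketch
{
    // Scans the first 8 UTF-16 code units of 'text' with one vector comparison,
    // then peels separator positions out of the cached bitmask one at a time,
    // much like the iterator drains _mask before loading the next chunk.
    public static void PrintSeparators(ReadOnlySpan<char> text)
    {
        Span<ushort> buffer = stackalloc ushort[Vector128<ushort>.Count];
        for (int i = 0; i < buffer.Length && i < text.Length; i++)
            buffer[i] = text[i];

        var chunk = Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(buffer));
        var slash = Vector128.Create((ushort)'/');

        // The portable ExtractMostSignificantBits yields one bit per element.
        uint mask = Vector128.Equals(chunk, slash).ExtractMostSignificantBits();

        while (mask != 0)
        {
            int index = BitOperations.TrailingZeroCount(mask);
            Console.WriteLine($"separator at element {index}");
            mask &= ~(1u << index); // consume this hit, keep the rest cached
        }
    }
}

For example, PrintSeparators("foo/bar/baz") would report elements 3 and 7.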