Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit a74f1db

Browse files
Port dotnet/runtime#31904 to release/3.1 (#28013)
Remove BMI2 from ASCII and UTF-16 processing hot paths, as not all processors have optimized implementations of pext/pdep
1 parent 6689dd7 commit a74f1db

File tree

3 files changed

+92
-156
lines changed

3 files changed

+92
-156
lines changed

src/System.Private.CoreLib/shared/System/Text/ASCIIUtility.cs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,10 +1009,14 @@ private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputB
10091009
{
10101010
Debug.Assert(AllCharsInUInt64AreAscii(value));
10111011

1012-
if (Bmi2.X64.IsSupported)
1012+
if (Sse2.X64.IsSupported)
10131013
{
1014-
// BMI2 will work regardless of the processor's endianness.
1015-
Unsafe.WriteUnaligned(ref outputBuffer, (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul));
1014+
// Narrows a vector of words [ w0 w1 w2 w3 0 0 0 0 ] to a vector of bytes
1015+
// [ b0 b1 b2 b3 0 0 0 0 b0 b1 b2 b3 0 0 0 0 ], then writes the low 4 bytes (32 bits) to the destination.
1016+
1017+
Vector128<short> vecWide = Sse2.X64.ConvertScalarToVector128UInt64(value).AsInt16();
1018+
Vector128<uint> vecNarrow = Sse2.PackUnsignedSaturate(vecWide, vecWide).AsUInt32();
1019+
Unsafe.WriteUnaligned<uint>(ref outputBuffer, Sse2.ConvertToUInt32(vecNarrow));
10161020
}
10171021
else
10181022
{
@@ -1694,14 +1698,16 @@ private static unsafe nuint WidenAsciiToUtf16_Sse2(byte* pAsciiBuffer, char* pUt
16941698
/// writes them to the output buffer with machine endianness.
16951699
/// </summary>
16961700
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1697-
private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
1701+
internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
16981702
{
16991703
Debug.Assert(AllBytesInUInt32AreAscii(value));
17001704

1701-
if (Bmi2.X64.IsSupported)
1705+
if (Sse2.X64.IsSupported)
17021706
{
1703-
// BMI2 will work regardless of the processor's endianness.
1704-
Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
1707+
Debug.Assert(BitConverter.IsLittleEndian, "SSE2 widening assumes little-endian.");
1708+
Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
1709+
Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
1710+
Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), Sse2.X64.ConvertToUInt64(vecWide));
17051711
}
17061712
else
17071713
{

src/System.Private.CoreLib/shared/System/Text/Unicode/Utf8Utility.Helpers.cs

Lines changed: 23 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
using System.Diagnostics;
77
using System.Numerics;
88
using System.Runtime.CompilerServices;
9-
using System.Runtime.Intrinsics.X86;
109
using Internal.Runtime.CompilerServices;
1110

1211
namespace System.Text.Unicode
@@ -61,47 +60,27 @@ private static uint ExtractCharFromFirstTwoByteSequence(uint value)
6160
}
6261

6362
/// <summary>
64-
/// Given a machine-endian DWORD which four bytes of UTF-8 data, interprets the input as a
63+
/// Given a machine-endian DWORD which represents four bytes of UTF-8 data, interprets the input as a
6564
/// four-byte UTF-8 sequence and returns the machine-endian DWORD of the UTF-16 representation.
6665
/// </summary>
6766
[MethodImpl(MethodImplOptions.AggressiveInlining)]
6867
private static uint ExtractCharsFromFourByteSequence(uint value)
6968
{
7069
if (BitConverter.IsLittleEndian)
7170
{
72-
if (Bmi2.IsSupported)
73-
{
74-
// need to reverse endianness for bit manipulation to work correctly
75-
value = BinaryPrimitives.ReverseEndianness(value);
76-
77-
// value = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
78-
// want to return [ 110110wwwwxxxxxx 110111xxxxxxxxxx ]
79-
// where wwww = uuuuu - 1
80-
81-
uint highSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000111_00111111_00110000_00000000u);
82-
uint lowSurrogateChar = Bmi2.ParallelBitExtract(value, 0b00000000_00000000_00001111_00111111u);
83-
84-
uint combined = (lowSurrogateChar << 16) + highSurrogateChar;
85-
combined -= 0x40u; // wwww = uuuuu - 1
86-
combined += 0xDC00_D800u; // add surrogate markers
87-
return combined;
88-
}
89-
else
90-
{
91-
// input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
92-
// want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
93-
// where wwww = uuuuu - 1
94-
uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ]
95-
retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ]
96-
retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ]
97-
retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ]
98-
retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ]
99-
retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ]
100-
retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ]
101-
retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
102-
retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
103-
return retVal;
104-
}
71+
// input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
72+
// want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
73+
// where wwww = uuuuu - 1
74+
uint retVal = (uint)(byte)value << 8; // retVal = [ 00000000 00000000 11110uuu 00000000 ]
75+
retVal |= (value & 0x0000_3F00u) >> 6; // retVal = [ 00000000 00000000 11110uuu uuzzzz00 ]
76+
retVal |= (value & 0x0030_0000u) >> 20; // retVal = [ 00000000 00000000 11110uuu uuzzzzyy ]
77+
retVal |= (value & 0x3F00_0000u) >> 8; // retVal = [ 00000000 00xxxxxx 11110uuu uuzzzzyy ]
78+
retVal |= (value & 0x000F_0000u) << 6; // retVal = [ 000000yy yyxxxxxx 11110uuu uuzzzzyy ]
79+
retVal -= 0x0000_0040u; // retVal = [ 000000yy yyxxxxxx 111100ww wwzzzzyy ]
80+
retVal -= 0x0000_2000u; // retVal = [ 000000yy yyxxxxxx 110100ww wwzzzzyy ]
81+
retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
82+
retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
83+
return retVal;
10584
}
10685
else
10786
{
@@ -135,37 +114,19 @@ private static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
135114
// input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx)
136115
// must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1
137116

138-
if (Bmi2.IsSupported)
139-
{
140-
// Since pdep and pext have high latencies and can only be dispatched to a single execution port, we want
141-
// to use them conservatively. Here, we'll build up the scalar value (this would normally be pext) via simple
142-
// logical and arithmetic operations, and use only pdep for the expensive step of exploding the scalar across
143-
// all four output bytes.
144-
145-
uint unmaskedScalar = (value << 10) + (value >> 16) + ((0x40u) << 10) /* uuuuu = wwww + 1 */ - 0xDC00u /* remove low surrogate marker */;
146-
147-
// Now, unmaskedScalar = [ xxxxxx11 011uuuuu zzzzyyyy yyxxxxxx ]. There's a bit of unneeded junk at the beginning
148-
// that should normally be masked out via an and, but we'll just direct pdep to ignore it.
117+
value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]
149118

150-
uint exploded = Bmi2.ParallelBitDeposit(unmaskedScalar, 0b00000111_00111111_00111111_00111111u); // = [ 00000uuu 00uuzzzz 00yyyyyy 00xxxxxx ]
151-
return BinaryPrimitives.ReverseEndianness(exploded + 0xF080_8080u); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
152-
}
153-
else
154-
{
155-
value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]
119+
uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
120+
tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]
156121

157-
uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
158-
tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]
122+
uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
123+
uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
124+
tempC |= tempB;
159125

160-
uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
161-
uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
162-
tempC |= tempB;
126+
uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
127+
tempD |= 0x8080_80F0u;
163128

164-
uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
165-
tempD |= 0x8080_80F0u;
166-
167-
return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
168-
}
129+
return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
169130
}
170131
else
171132
{
@@ -756,43 +717,6 @@ private static bool UInt32ThirdByteIsAscii(uint value)
756717
|| (!BitConverter.IsLittleEndian && ((value & 0x8000u) == 0));
757718
}
758719

759-
/// <summary>
760-
/// Given a DWORD which represents a buffer of 4 ASCII bytes, widen each byte to a 16-bit WORD
761-
/// and writes the resulting QWORD into the destination with machine endianness.
762-
/// </summary>
763-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
764-
private static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint value)
765-
{
766-
if (Bmi2.X64.IsSupported)
767-
{
768-
// BMI2 will work regardless of the processor's endianness.
769-
Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
770-
}
771-
else
772-
{
773-
if (BitConverter.IsLittleEndian)
774-
{
775-
outputBuffer = (char)(byte)value;
776-
value >>= 8;
777-
Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
778-
value >>= 8;
779-
Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
780-
value >>= 8;
781-
Unsafe.Add(ref outputBuffer, 3) = (char)value;
782-
}
783-
else
784-
{
785-
Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
786-
value >>= 8;
787-
Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
788-
value >>= 8;
789-
Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
790-
value >>= 8;
791-
outputBuffer = (char)value;
792-
}
793-
}
794-
}
795-
796720
/// <summary>
797721
/// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianness,
798722
/// converts those scalar values to their 3-byte UTF-8 representation and writes the

0 commit comments

Comments
 (0)