Skip to content

Commit cb45af9

Browse files
lilinus, tannergooding, stephentoub
authored
Implement IUtf8SpanParsable on Char and Rune (#105773)
* Implement IUtf8SpanParsable on Rune
* Implement IUtf8SpanParsable on char
* Minor char and rune fixes
* Add tests
* Add docs to Char
* Add missing source file in project file
* Apply suggestions from code review

---------

Co-authored-by: Tanner Gooding <[email protected]>
Co-authored-by: Stephen Toub <[email protected]>
1 parent 85a54f2 commit cb45af9

File tree

7 files changed

+160
-1
lines changed

7 files changed

+160
-1
lines changed

src/libraries/Common/tests/System/GenericMathHelpers.cs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,14 @@ public static class SpanParsableHelper<TSelf>
463463
public static bool TryParse(ReadOnlySpan<char> s, IFormatProvider provider, out TSelf result) => TSelf.TryParse(s, provider, out result);
464464
}
465465

466+
// Test helper that forwards to the static members of IUtf8SpanParsable<TSelf> through a
// constrained type parameter. Char and Rune implement these members explicitly in this commit,
// so the tests reach them as Utf8SpanParsableHelper<char>.Parse(...) / Utf8SpanParsableHelper<Rune>.Parse(...).
public static class Utf8SpanParsableHelper<TSelf>
467+
where TSelf : IUtf8SpanParsable<TSelf>
468+
{
469+
// Forwards to TSelf.Parse; any exception thrown by the implementation propagates to the caller.
public static TSelf Parse(ReadOnlySpan<byte> s, IFormatProvider provider) => TSelf.Parse(s, provider);
470+
471+
// Forwards to TSelf.TryParse and returns its success flag; result is whatever the implementation assigned.
public static bool TryParse(ReadOnlySpan<byte> s, IFormatProvider provider, out TSelf result) => TSelf.TryParse(s, provider, out result);
472+
}
473+
466474
public static class SubtractionOperatorsHelper<TSelf, TOther, TResult>
467475
where TSelf : ISubtractionOperators<TSelf, TOther, TResult>
468476
{

src/libraries/System.Private.CoreLib/src/System/Char.cs

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ public readonly struct Char
2929
IMinMaxValue<char>,
3030
IUnsignedNumber<char>,
3131
IUtf8SpanFormattable,
32+
IUtf8SpanParsable<char>,
3233
IUtfChar<char>,
3334
IBinaryIntegerParseAndFormatInfo<char>
3435
{
@@ -230,6 +231,38 @@ internal static bool TryParse(ReadOnlySpan<char> s, out char result)
230231
return true;
231232
}
232233

234+
/// <inheritdoc cref="IUtf8SpanParsable{TSelf}.Parse(ReadOnlySpan{byte}, IFormatProvider?)" />
235+
static char IUtf8SpanParsable<char>.Parse(ReadOnlySpan<byte> utf8Text, IFormatProvider? provider)
236+
{
237+
// The whole span must decode as exactly one UTF-8 scalar value: decoding has to report Done
// and consume every byte. Empty, malformed, truncated, or multi-character input is rejected
// as a format error. The provider argument is not consulted.
if (Rune.DecodeFromUtf8(utf8Text, out Rune rune, out int bytesConsumed) != Buffers.OperationStatus.Done ||
238+
bytesConsumed != utf8Text.Length)
239+
{
240+
ThrowHelper.ThrowFormatInvalidString();
241+
}
242+
243+
// A well-formed scalar outside the Basic Multilingual Plane cannot be represented by a single
// char, so it is reported as an overflow rather than a format error.
if (!rune.IsBmp)
244+
{
245+
Number.ThrowOverflowException<char>();
246+
}
247+
248+
// rune.IsBmp was verified above, so the value is cast to char directly.
return (char)rune.Value;
249+
}
250+
251+
/// <inheritdoc cref="IUtf8SpanParsable{TSelf}.TryParse(ReadOnlySpan{byte}, IFormatProvider?, out TSelf)" />
252+
static bool IUtf8SpanParsable<char>.TryParse(ReadOnlySpan<byte> utf8Text, IFormatProvider? provider, out char result)
253+
{
254+
// Non-throwing counterpart of Parse: fails when decoding does not report Done, when bytes are
// left over after the first scalar, or when the scalar is outside the BMP. All three failures
// are reported the same way (false with result set to '\0'). The provider argument is not consulted.
if (Rune.DecodeFromUtf8(utf8Text, out Rune rune, out int bytesConsumed) != Buffers.OperationStatus.Done ||
255+
bytesConsumed != utf8Text.Length ||
256+
!rune.IsBmp)
257+
{
258+
result = '\0';
259+
return false;
260+
}
261+
262+
// rune.IsBmp was verified above, so the value is cast to char directly.
result = (char)rune.Value;
263+
return true;
264+
}
265+
233266
//
234267
// Static Methods
235268
//

src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ namespace System.Text
3232
#pragma warning disable SA1001 // Commas should be spaced correctly
3333
, ISpanFormattable
3434
, IUtf8SpanFormattable
35+
, IUtf8SpanParsable<Rune>
3536
#pragma warning restore SA1001
3637
#endif
3738
{
@@ -938,6 +939,33 @@ bool ISpanFormattable.TryFormat(Span<char> destination, out int charsWritten, Re
938939
bool IUtf8SpanFormattable.TryFormat(Span<byte> utf8Destination, out int bytesWritten, ReadOnlySpan<char> format, IFormatProvider? provider) =>
939940
TryEncodeToUtf8(utf8Destination, out bytesWritten);
940941

942+
/// <inheritdoc cref="IUtf8SpanParsable{TSelf}.TryParse(ReadOnlySpan{byte}, IFormatProvider?, out TSelf)" />
943+
static bool IUtf8SpanParsable<Rune>.TryParse(ReadOnlySpan<byte> utf8Text, IFormatProvider? provider, out Rune result)
944+
{
945+
// Succeeds only when the span is exactly one well-formed UTF-8 scalar value.
// The provider argument is not consulted.
if (DecodeFromUtf8(utf8Text, out result, out int bytesConsumed) == OperationStatus.Done)
946+
{
947+
if (bytesConsumed == utf8Text.Length)
948+
{
949+
return true;
950+
}
951+
952+
// A scalar was decoded but trailing bytes remain: discard the decoded value so that
// the failure result is ReplacementChar rather than the first scalar of the input.
result = ReplacementChar;
953+
}
954+
955+
// When decoding did not report Done, result keeps whatever DecodeFromUtf8 assigned.
// NOTE(review): presumably ReplacementChar, which is what RuneTests.ParseUtf8_Invalid asserts - confirm against DecodeFromUtf8.
return false;
956+
}
957+
958+
/// <inheritdoc cref="IUtf8SpanParsable{TSelf}.Parse(ReadOnlySpan{byte}, IFormatProvider?)" />
959+
static Rune IUtf8SpanParsable<Rune>.Parse(ReadOnlySpan<byte> utf8Text, System.IFormatProvider? provider)
960+
{
961+
// The whole span must decode as exactly one UTF-8 scalar value: decoding has to report Done
// and consume every byte; anything else is rejected as a format error.
// The provider argument is not consulted.
if (DecodeFromUtf8(utf8Text, out Rune result, out int bytesConsumed) != OperationStatus.Done || bytesConsumed != utf8Text.Length)
962+
{
963+
ThrowHelper.ThrowFormatInvalidString();
964+
}
965+
966+
return result;
967+
}
968+
941969
string IFormattable.ToString(string? format, IFormatProvider? formatProvider) => ToString();
942970
#endif
943971

src/libraries/System.Runtime/ref/System.Runtime.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1159,6 +1159,8 @@ public CannotUnloadAppDomainException(string? message, System.Exception? innerEx
11591159
static char System.ISpanParsable<char>.Parse(System.ReadOnlySpan<char> s, System.IFormatProvider? provider) { throw null; }
11601160
static bool System.ISpanParsable<char>.TryParse(System.ReadOnlySpan<char> s, System.IFormatProvider? provider, out char result) { throw null; }
11611161
bool System.IUtf8SpanFormattable.TryFormat(System.Span<byte> utf8Destination, out int bytesWritten, System.ReadOnlySpan<char> format, System.IFormatProvider? provider) { throw null; }
1162+
static char System.IUtf8SpanParsable<char>.Parse(System.ReadOnlySpan<byte> utf8Text, System.IFormatProvider? provider) { throw null; }
1163+
static bool System.IUtf8SpanParsable<char>.TryParse(System.ReadOnlySpan<byte> utf8Text, System.IFormatProvider? provider, out char result) { throw null; }
11621164
static char System.Numerics.IAdditionOperators<char, char, char>.operator +(char left, char right) { throw null; }
11631165
static char System.Numerics.IAdditionOperators<char, char, char>.operator checked +(char left, char right) { throw null; }
11641166
int System.Numerics.IBinaryInteger<char>.GetByteCount() { throw null; }
@@ -15602,7 +15604,7 @@ public enum NormalizationForm
1560215604
[System.Runtime.Versioning.UnsupportedOSPlatformAttribute("browser")]
1560315605
FormKD = 6,
1560415606
}
15605-
public readonly partial struct Rune : System.IComparable, System.IComparable<System.Text.Rune>, System.IEquatable<System.Text.Rune>, System.IFormattable, System.ISpanFormattable, System.IUtf8SpanFormattable
15607+
public readonly partial struct Rune : System.IComparable, System.IComparable<System.Text.Rune>, System.IEquatable<System.Text.Rune>, System.IFormattable, System.ISpanFormattable, System.IUtf8SpanFormattable, System.IUtf8SpanParsable<System.Text.Rune>
1560615608
{
1560715609
private readonly int _dummyPrimitive;
1560815610
public Rune(char ch) { throw null; }
@@ -15658,6 +15660,8 @@ public enum NormalizationForm
1565815660
string System.IFormattable.ToString(string? format, System.IFormatProvider? formatProvider) { throw null; }
1565915661
bool System.ISpanFormattable.TryFormat(System.Span<char> destination, out int charsWritten, System.ReadOnlySpan<char> format, System.IFormatProvider? provider) { throw null; }
1566015662
bool System.IUtf8SpanFormattable.TryFormat(System.Span<byte> utf8Destination, out int bytesWritten, System.ReadOnlySpan<char> format, System.IFormatProvider? provider) { throw null; }
15663+
static System.Text.Rune System.IUtf8SpanParsable<System.Text.Rune>.Parse(System.ReadOnlySpan<byte> utf8Text, System.IFormatProvider? provider) { throw null; }
15664+
static bool System.IUtf8SpanParsable<System.Text.Rune>.TryParse(System.ReadOnlySpan<byte> utf8Text, System.IFormatProvider? provider, out System.Text.Rune result) { throw null; }
1566115665
public static System.Text.Rune ToLower(System.Text.Rune value, System.Globalization.CultureInfo culture) { throw null; }
1566215666
public static System.Text.Rune ToLowerInvariant(System.Text.Rune value) { throw null; }
1566315667
public override string ToString() { throw null; }

src/libraries/System.Runtime/tests/System.Runtime.Tests/NlsTests/System.Runtime.Nls.Tests.csproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
Link="System\Uri.MethodsTests.cs" />
4141
<Compile Include="$(CommonTestPath)System\EnumTypes.cs"
4242
Link="Common\System\EnumTypes.cs" />
43+
<Compile Include="$(CommonTestPath)System\GenericMathHelpers.cs"
44+
Link="Common\System\GenericMathHelpers.cs" />
4345
<Compile Include="$(CommonTestPath)System\MockType.cs"
4446
Link="Common\System\MockType.cs" />
4547
<Compile Include="$(CommonTestPath)Tests\System\StringTests.cs"

src/libraries/System.Runtime/tests/System.Runtime.Tests/System/CharTests.cs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,6 +1028,48 @@ public static void Parse_Invalid(string s, Type exceptionType)
10281028
Assert.Throws(exceptionType, () => char.Parse(s));
10291029
}
10301030

1031+
// Valid inputs: one-, two-, and three-byte UTF-8 sequences, each encoding a single BMP character.
// Parse and TryParse are both exercised through Utf8SpanParsableHelper<char> and must agree.
[Theory]
1032+
[InlineData(new byte[] { 0x30 }, '\u0030')] // ASCII byte
1033+
[InlineData(new byte[] { 0xC3, 0x90 }, '\u00d0')] // [ C3 90 ] is U+00D0 LATIN CAPITAL LETTER ETH
1034+
[InlineData(new byte[] { 0xE2, 0x88, 0xB4 }, '\u2234')] // [ E2 88 B4 ] is U+2234 THEREFORE
1035+
public static void ParseUtf8(byte[] data, char expectedChar)
1036+
{
1037+
Assert.Equal(expectedChar, Utf8SpanParsableHelper<char>.Parse(data, null));
1038+
Assert.True(Utf8SpanParsableHelper<char>.TryParse(data, null, out char actualChar));
1039+
Assert.Equal(expectedChar, actualChar);
1040+
}
1041+
1042+
// Invalid inputs: empty, multi-character, malformed, and truncated UTF-8 must make Parse throw
// FormatException; a well-formed four-byte sequence (a scalar outside the char range) must make it
// throw OverflowException. For every case TryParse must return false and set the result to '\0'.
[Theory]
1043+
[InlineData(new byte[0], typeof(FormatException))] // empty buffer
1044+
[InlineData(new byte[] { 0x30, 0x40, 0x50 }, typeof(FormatException))] // Multiple ASCII bytes
1045+
[InlineData(new byte[] { 0x80 }, typeof(FormatException))] // standalone continuation byte
1046+
[InlineData(new byte[] { 0x80, 0x80, 0x80 }, typeof(FormatException))] // multiple standalone continuation bytes
1047+
[InlineData(new byte[] { 0xC1 }, typeof(FormatException))] // C1 is never a valid UTF-8 byte
1048+
[InlineData(new byte[] { 0xF5 }, typeof(FormatException))] // F5 is never a valid UTF-8 byte
1049+
[InlineData(new byte[] { 0xC2 }, typeof(FormatException))] // C2 is a valid byte; expecting it to be followed by a continuation byte
1050+
[InlineData(new byte[] { 0xED }, typeof(FormatException))] // ED is a valid byte; expecting it to be followed by a continuation byte
1051+
[InlineData(new byte[] { 0xF4 }, typeof(FormatException))] // F4 is a valid byte; expecting it to be followed by a continuation byte
1052+
[InlineData(new byte[] { 0xC2, 0xC2 }, typeof(FormatException))] // C2 not followed by continuation byte
1053+
[InlineData(new byte[] { 0xC1, 0xBF }, typeof(FormatException))] // [ C1 BF ] is overlong 2-byte sequence, all overlong sequences have maximal invalid subsequence length 1
1054+
[InlineData(new byte[] { 0xE0, 0x9F }, typeof(FormatException))] // [ E0 9F ] is overlong 3-byte sequence, all overlong sequences have maximal invalid subsequence length 1
1055+
[InlineData(new byte[] { 0xE0, 0xA0 }, typeof(FormatException))] // [ E0 A0 ] is valid 2-byte start of 3-byte sequence
1056+
[InlineData(new byte[] { 0xED, 0x9F }, typeof(FormatException))] // [ ED 9F ] is valid 2-byte start of 3-byte sequence
1057+
[InlineData(new byte[] { 0xED, 0xBF }, typeof(FormatException))] // [ ED BF ] would place us in UTF-16 surrogate range, all surrogate sequences have maximal invalid subsequence length 1
1058+
[InlineData(new byte[] { 0xEE, 0x80 }, typeof(FormatException))] // [ EE 80 ] is valid 2-byte start of 3-byte sequence
1059+
[InlineData(new byte[] { 0xF0, 0x8F }, typeof(FormatException))] // [ F0 8F ] is overlong 4-byte sequence, all overlong sequences have maximal invalid subsequence length 1
1060+
[InlineData(new byte[] { 0xF0, 0x90 }, typeof(FormatException))] // [ F0 90 ] is valid 2-byte start of 4-byte sequence
1061+
[InlineData(new byte[] { 0xF4, 0x90 }, typeof(FormatException))] // [ F4 90 ] would place us beyond U+10FFFF, all such sequences have maximal invalid subsequence length 1
1062+
[InlineData(new byte[] { 0xE2, 0x88, 0xC0 }, typeof(FormatException))] // [ E2 88 ] followed by non-continuation byte, maximal invalid subsequence length 2
1063+
[InlineData(new byte[] { 0xF0, 0x9F, 0x98 }, typeof(FormatException))] // [ F0 9F 98 ] is valid 3-byte start of 4-byte sequence
1064+
[InlineData(new byte[] { 0xF0, 0x9F, 0x98, 0x20 }, typeof(FormatException))] // [ F0 9F 98 ] followed by non-continuation byte, maximal invalid subsequence length 3
1065+
[InlineData(new byte[] { 0xF0, 0x9F, 0x98, 0xB2 }, typeof(OverflowException))] // [ F0 9F 98 B2 ] is U+1F632 ASTONISHED FACE; outside char range
1066+
public static void ParseUtf8_Invalid(byte[] data, Type exceptionType)
1067+
{
1068+
Assert.Throws(exceptionType, () => Utf8SpanParsableHelper<char>.Parse(data, null));
1069+
Assert.False(Utf8SpanParsableHelper<char>.TryParse(data, null, out char actualChar));
1070+
Assert.Equal('\0', actualChar);
1071+
}
1072+
10311073
private static IEnumerable<char> GetTestCharsNotInCategory(params UnicodeCategory[] categories)
10321074
{
10331075
Assert.Equal(s_latinTestSet.Length, s_unicodeTestSet.Length);

src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/RuneTests.cs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,48 @@ public static void DecodeFromUtf8(byte[] data, OperationStatus expectedOperation
248248
Assert.Equal(expectedBytesConsumed, actualBytesConsumed);
249249
}
250250

251+
// Valid inputs: one- to four-byte UTF-8 sequences, each encoding a single scalar value.
// Parse and TryParse are both exercised through Utf8SpanParsableHelper<Rune> and must agree.
[Theory]
252+
[InlineData(new byte[] { 0x30 }, 0x0030)] // ASCII byte
253+
[InlineData(new byte[] { 0xC3, 0x90 }, 0x00D0)] // [ C3 90 ] is U+00D0 LATIN CAPITAL LETTER ETH
254+
[InlineData(new byte[] { 0xE2, 0x88, 0xB4 }, 0x2234)] // [ E2 88 B4 ] is U+2234 THEREFORE
255+
[InlineData(new byte[] { 0xF0, 0x9F, 0x98, 0xB2 }, 0x1F632)] // [ F0 9F 98 B2 ] is U+1F632 ASTONISHED FACE
256+
public static void ParseUtf8(byte[] data, int expectedRuneValue)
257+
{
258+
Assert.Equal(expectedRuneValue, Utf8SpanParsableHelper<Rune>.Parse(data, null).Value);
259+
Assert.True(Utf8SpanParsableHelper<Rune>.TryParse(data, null, out Rune actualRune));
260+
Assert.Equal(expectedRuneValue, actualRune.Value);
261+
}
262+
263+
// Invalid inputs: empty, multi-character, malformed, and truncated UTF-8 must make Parse throw
// FormatException. For every case TryParse must return false and leave Rune.ReplacementChar in the result.
[Theory]
264+
[InlineData(new byte[0])] // empty buffer
265+
[InlineData(new byte[] { 0x30, 0x40, 0x50 })] // Multiple ASCII bytes
266+
[InlineData(new byte[] { 0x80 })] // standalone continuation byte
267+
[InlineData(new byte[] { 0x80, 0x80, 0x80 })] // multiple standalone continuation bytes
268+
[InlineData(new byte[] { 0xC1 })] // C1 is never a valid UTF-8 byte
269+
[InlineData(new byte[] { 0xF5 })] // F5 is never a valid UTF-8 byte
270+
[InlineData(new byte[] { 0xC2 })] // C2 is a valid byte; expecting it to be followed by a continuation byte
271+
[InlineData(new byte[] { 0xED })] // ED is a valid byte; expecting it to be followed by a continuation byte
272+
[InlineData(new byte[] { 0xF4 })] // F4 is a valid byte; expecting it to be followed by a continuation byte
273+
[InlineData(new byte[] { 0xC2, 0xC2 })] // C2 not followed by continuation byte
274+
[InlineData(new byte[] { 0xC1, 0xBF })] // [ C1 BF ] is overlong 2-byte sequence, all overlong sequences have maximal invalid subsequence length 1
275+
[InlineData(new byte[] { 0xE0, 0x9F })] // [ E0 9F ] is overlong 3-byte sequence, all overlong sequences have maximal invalid subsequence length 1
276+
[InlineData(new byte[] { 0xE0, 0xA0 })] // [ E0 A0 ] is valid 2-byte start of 3-byte sequence
277+
[InlineData(new byte[] { 0xED, 0x9F })] // [ ED 9F ] is valid 2-byte start of 3-byte sequence
278+
[InlineData(new byte[] { 0xED, 0xBF })] // [ ED BF ] would place us in UTF-16 surrogate range, all surrogate sequences have maximal invalid subsequence length 1
279+
[InlineData(new byte[] { 0xEE, 0x80 })] // [ EE 80 ] is valid 2-byte start of 3-byte sequence
280+
[InlineData(new byte[] { 0xF0, 0x8F })] // [ F0 8F ] is overlong 4-byte sequence, all overlong sequences have maximal invalid subsequence length 1
281+
[InlineData(new byte[] { 0xF0, 0x90 })] // [ F0 90 ] is valid 2-byte start of 4-byte sequence
282+
[InlineData(new byte[] { 0xF4, 0x90 })] // [ F4 90 ] would place us beyond U+10FFFF, all such sequences have maximal invalid subsequence length 1
283+
[InlineData(new byte[] { 0xE2, 0x88, 0xC0 })] // [ E2 88 ] followed by non-continuation byte, maximal invalid subsequence length 2
284+
[InlineData(new byte[] { 0xF0, 0x9F, 0x98 })] // [ F0 9F 98 ] is valid 3-byte start of 4-byte sequence
285+
[InlineData(new byte[] { 0xF0, 0x9F, 0x98, 0x20 })] // [ F0 9F 98 ] followed by non-continuation byte, maximal invalid subsequence length 3
286+
public static void ParseUtf8_Invalid(byte[] data)
287+
{
288+
Assert.Throws<FormatException>(() => Utf8SpanParsableHelper<Rune>.Parse(data, null));
289+
Assert.False(Utf8SpanParsableHelper<Rune>.TryParse(data, null, out Rune actualRune));
290+
Assert.Equal(Rune.ReplacementChar, actualRune);
291+
}
292+
251293
[Theory]
252294
[InlineData(new byte[0], OperationStatus.NeedMoreData, 0xFFFD, 0)] // empty buffer
253295
[InlineData(new byte[] { 0x30 }, OperationStatus.Done, 0x0030, 1)] // ASCII byte

0 commit comments

Comments
 (0)