Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 1b0e8e3

Browse files
authored
Fix incorrect handling of invalid UTF-8 characters (#17302)
There are some behavioral differences between the C/C++ UTF-8 encoder/decoder and Encoding.UTF8, as mentioned in #16786.
1 parent 54c2b5b commit 1b0e8e3

File tree

3 files changed

+112
-39
lines changed

3 files changed

+112
-39
lines changed

src/pal/src/locale/utf8.cpp

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ class DecoderFallbackBuffer
346346
throw ArgumentException("String 'chars' contains invalid Unicode code points.");
347347

348348
// Now we aren't going to be false, so its OK to update chars
349-
chars = &charTemp;
349+
*chars = charTemp;
350350
}
351351

352352
return true;
@@ -412,7 +412,7 @@ class DecoderFallbackBuffer
412412
class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer
413413
{
414414
// Store our default string
415-
WCHAR strDefault[4];
415+
WCHAR strDefault[2];
416416
int strDefaultLength;
417417
int fallbackCount = -1;
418418
int fallbackIndex = -1;
@@ -421,11 +421,8 @@ class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer
421421
// Construction
422422
DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback)
423423
{
424-
// 2X in case we're a surrogate pair
425424
wcscpy_s(strDefault, sizeof(strDefault), fallback->GetDefaultString());
426-
wcscat_s(strDefault, sizeof(strDefault), fallback->GetDefaultString());
427-
strDefaultLength = 2 * PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
428-
425+
strDefaultLength = PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
429426
}
430427

431428
// Fallback Methods
@@ -1081,9 +1078,14 @@ class UTF8Encoding
10811078
return begin <= c && c <= end;
10821079
}
10831080

1084-
size_t PtrDiff(void* ptr1, void* ptr2)
1081+
size_t PtrDiff(WCHAR* ptr1, WCHAR* ptr2)
10851082
{
1086-
return (BYTE*)ptr2 - (BYTE*)ptr1;
1083+
return ptr1 - ptr2;
1084+
}
1085+
1086+
size_t PtrDiff(BYTE* ptr1, BYTE* ptr2)
1087+
{
1088+
return ptr1 - ptr2;
10871089
}
10881090

10891091
void ThrowBytesOverflow()
@@ -1118,6 +1120,28 @@ class UTF8Encoding
11181120
}
11191121
}
11201122

1123+
// During GetChars we had an invalid byte sequence
1124+
// pSrc is backed up to the start of the bad sequence if we didn't have room to
1125+
// fall it back. Otherwise pSrc remains where it is.
1126+
bool FallbackInvalidByteSequence(BYTE** pSrc, int ch, DecoderFallbackBuffer* fallback, WCHAR** pTarget)
1127+
{
1128+
// Get our byte[]
1129+
BYTE* pStart = *pSrc;
1130+
BYTE* bytesUnknown;
1131+
int size = GetBytesUnknown(pStart, ch, &bytesUnknown);
1132+
1133+
// Do the actual fallback
1134+
if (!fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size))
1135+
{
1136+
// Oops, it failed, back up to pStart
1137+
*pSrc = pStart;
1138+
return false;
1139+
}
1140+
1141+
// It worked
1142+
return true;
1143+
}
1144+
11211145
int FallbackInvalidByteSequence(BYTE* pSrc, int ch, DecoderFallbackBuffer *fallback)
11221146
{
11231147
// Get our byte[]
@@ -1211,7 +1235,7 @@ class UTF8Encoding
12111235
public:
12121236

12131237
UTF8Encoding(bool isThrowException)
1214-
: encoderReplacementFallback(W("\xFFFD"))
1238+
: encoderReplacementFallback(W("\xFFFD")), decoderReplacementFallback(W("\xFFFD"))
12151239
{
12161240
if (isThrowException)
12171241
{
@@ -1704,8 +1728,9 @@ class UTF8Encoding
17041728
fallback = decoderFallback->CreateFallbackBuffer();
17051729
fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
17061730
}
1707-
// This'll back us up the appropriate # of bytes if we didn't get anywhere
1708-
if (!FallbackInvalidByteSequence(pSrc, ch, fallback))
1731+
1732+
// That'll back us up the appropriate # of bytes if we didn't get anywhere
1733+
if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget))
17091734
{
17101735
// Ran out of buffer space
17111736
// Need to throw an exception?

src/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -152,57 +152,57 @@ int __cdecl main(int argc, char *argv[])
152152

153153
// Strings with errors
154154
// Incomplete 2 byte encoded character standalone
155-
W(""),
155+
W("\xFFFD"),
156156
// Incomplete 3 byte encoded character 1 byte missing standalone
157-
W(""),
157+
W("\xFFFD"),
158158
// Incomplete 3 byte encoded character 2 bytes missing standalone
159-
W(""),
159+
W("\xFFFD"),
160160
// Incomplete surrogate character 1 byte missing standalone
161-
W(""),
161+
W("\xFFFD"),
162162
// Incomplete surrogate character 2 bytes missing standalone
163-
W(""),
163+
W("\xFFFD"),
164164
// Incomplete surrogate character 3 bytes missing standalone
165-
W(""),
165+
W("\xFFFD"),
166166
// Trailing byte with no lead byte standalone
167-
W(""),
167+
W("\xFFFD"),
168168
// Incomplete 2 byte encoded character 1 byte missing between 1 byte chars
169-
W("\x0041\x0042"),
169+
W("\x0041\xFFFD\x0042"),
170170
// Incomplete 3 byte encoded character 1 byte missing between 1 byte chars
171-
W("\x0041\x0042"),
171+
W("\x0041\xFFFD\x0042"),
172172
// Incomplete 3 byte encoded character 2 bytes missing between 1 byte chars
173-
W("\x0041\x0042"),
173+
W("\x0041\xFFFD\x0042"),
174174
// Trailing byte with no lead byte between 1 byte chars
175-
W("\x0041\x0042"),
175+
W("\x0041\xFFFD\x0042"),
176176
// Incomplete 2 byte encoded character 1 byte missing before 1 byte char
177-
W("\x0042"),
177+
W("\xFFFD\x0042"),
178178
// Incomplete 3 byte encoded character 1 byte missing before 1 byte char
179-
W("\x0042"),
179+
W("\xFFFD\x0042"),
180180
// Incomplete 3 byte encoded character 2 bytes missing before 1 byte char
181-
W("\x0042"),
181+
W("\xFFFD\x0042"),
182182
// Trailing byte with no lead byte before 1 byte char
183-
W("\x0042"),
183+
W("\xFFFD\x0042"),
184184
// Incomplete 2 byte encoded character 1 byte missing after 1 byte char
185-
W("\x0041"),
185+
W("\x0041\xFFFD"),
186186
// Incomplete 3 byte encoded character 1 byte missing after 1 byte char
187-
W("\x0041"),
187+
W("\x0041\xFFFD"),
188188
// Incomplete 3 byte encoded character 2 bytes missing after 1 byte char
189-
W("\x0041"),
189+
W("\x0041\xFFFD"),
190190
// Trailing byte with no lead byte after 1 byte char
191-
W("\x0041"),
191+
W("\x0041\xFFFD"),
192192
// Incomplete 2 byte encoded character 1 byte missing between 2 byte chars
193-
W("\x0080\x00FF"),
193+
W("\x0080\xFFFD\x00FF"),
194194
// Incomplete 3 byte encoded character 1 byte missing between 2 byte chars
195-
W("\x0080\x00FF"),
195+
W("\x0080\xFFFD\x00FF"),
196196
// Incomplete 3 byte encoded character 2 bytes missing between 2 byte chars
197-
W("\x0080\x00FF"),
197+
W("\x0080\xFFFD\x00FF"),
198198
// Trailing byte with no lead byte between 2 byte chars
199-
W("\x0080\x00FF"),
199+
W("\x0080\xFFFD\x00FF"),
200200
// 2 byte encoded character in non-shortest form encodings (these are not allowed)
201-
W(""),
201+
W("\xFFFD\xFFFD"),
202202
// 3 byte encoded character in non-shortest form encodings (these are not allowed)
203-
W(""),
203+
W("\xFFFD\xFFFD"),
204204
// 4 byte encoded character in non-shortest form encodings (these are not allowed)
205-
W(""),
205+
W("\xFFFD\xFFFD\xFFFD"),
206206
};
207207

208208
for (int i = 0; i < (sizeof(utf8Strings) / sizeof(utf8Strings[0])); i++)

tests/src/Interop/StringMarshalling/UTF8/UTF8Test.cs

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,21 @@ public struct Utf8Struct
126126
public int index;
127127
}
128128

129+
unsafe struct UnmanagedStruct
130+
{
131+
public fixed byte psz[8];
132+
}
133+
134+
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)]
135+
struct ManagedStruct
136+
{
137+
[MarshalAs(UnmanagedType.ByValTStr, SizeConst = 8)]
138+
public string str;
139+
}
140+
129141
[DllImport("UTF8TestNative", CallingConvention = CallingConvention.Cdecl)]
130142
public static extern void TestStructWithUtf8Field(Utf8Struct utfStruct);
143+
131144
public static void TestUTF8StructMarshalling(string[] utf8Strings)
132145
{
133146
Utf8Struct utf8Struct = new Utf8Struct();
@@ -137,7 +150,40 @@ public static void TestUTF8StructMarshalling(string[] utf8Strings)
137150
utf8Struct.index = i;
138151
TestStructWithUtf8Field(utf8Struct);
139152
}
140-
}
153+
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
154+
CompareWithUTF8Encoding();
155+
}
156+
157+
unsafe static void CompareWithUTF8Encoding()
158+
{
159+
// Compare results with UTF8Encoding
160+
UnmanagedStruct ums;
161+
ums.psz[0] = 0xFF;
162+
ums.psz[1] = (byte)'a';
163+
ums.psz[2] = (byte)'b';
164+
ums.psz[3] = (byte)'c';
165+
ums.psz[4] = (byte)'d';
166+
ums.psz[5] = 0;
167+
168+
IntPtr ptr = (IntPtr)(&ums);
169+
ManagedStruct ms = Marshal.PtrToStructure<ManagedStruct>(ptr);
170+
string actual = ms.str;
171+
172+
UTF8Encoding uTF8Encoding = new UTF8Encoding();
173+
byte [] b = new byte[5];
174+
b[0] = 0xFF;
175+
b[1] = (byte)'a';
176+
b[2] = (byte)'b';
177+
b[3] = (byte)'c';
178+
b[4] = (byte)'d';
179+
string expected = uTF8Encoding.GetString(b);
180+
if (actual != expected)
181+
{
182+
Console.WriteLine("Actual:" + actual + " Length:" + actual.Length);
183+
Console.WriteLine("Expected:" + expected + " Length:" + expected.Length);
184+
throw new Exception("UTF8Encoding.GetString doesn't match with Utf8 String Marshaller result");
185+
}
186+
}
141187
}
142188

143189
// UTF8 string as delegate parameter
@@ -165,6 +211,7 @@ public static void Utf8StringCallback(string nativeString, int index)
165211
}
166212
}
167213

214+
168215
class Test
169216
{
170217
//test strings
@@ -216,6 +263,7 @@ public static int Main(string[] args)
216263
// String.Empty tests
217264
UTF8StringTests.EmptyStringTest();
218265

266+
219267
return 100;
220268
}
221-
}
269+
}

0 commit comments

Comments
 (0)