@@ -275,5 +275,233 @@ public void Utf8EncodingTests_TestFullASCIIRange()
275275 byte [ ] reencoded = Encoding . UTF8 . GetBytes ( decoded ) ;
276276 RoundtripUtf8 ( reencoded , input , 127 ) ;
277277 }
278+
279+ // NEW TESTS FOR FIXES
280+
[TestMethod]
public void Utf8EncodingTests_TestIncrementalDecodingExactBuffer()
{
    // Regression check for a destination buffer that is exactly as large as
    // the requested character count, i.e. with no spare slot for a null
    // terminator (the original failure was noted as iMaxChars=1,
    // outputUTF16_size=1).
    byte[] source = Encoding.UTF8.GetBytes("AB");
    char[] destination = new char[1];

    int consumedBytes;
    int producedChars;
    bool finished;

    // Convert only the first byte into the one-element buffer, without flushing.
    var utf8Decoder = Encoding.UTF8.GetDecoder();
    utf8Decoder.Convert(
        source, 0, 1,
        destination, 0, 1,
        false,
        out consumedBytes, out producedChars, out finished);

    Assert.AreEqual(1, consumedBytes);
    Assert.AreEqual(1, producedChars);
    Assert.AreEqual('A', destination[0]);
}
301+
[TestMethod]
public void Utf8EncodingTests_TestInvalidSurrogatePairHandling()
{
    // UTF-16 -> UTF-8 with a broken pair: a high surrogate (0xD800) is
    // followed directly by the ordinary characters 'A' and 'B'. The test
    // expects the lone surrogate to be emitted as U+FFFD and the two
    // letters to survive unchanged.
    string malformed = new string(new char[] { (char)0xD800, 'A', 'B' });

    // U+FFFD (EF BF BD), then 'A' (0x41), then 'B' (0x42).
    byte[] wanted = { 0xEF, 0xBF, 0xBD, 0x41, 0x42 };

    byte[] actual = Encoding.UTF8.GetBytes(malformed);

    CollectionAssert.AreEqual(wanted, actual);
}
319+
[TestMethod]
public void Utf8EncodingTests_TestInvalidSurrogatePairMiddle()
{
    // Two consecutive high surrogates (0xD800, 0xD801) sandwiched between
    // 'A' and 'B'. The test expects one U+FFFD per surrogate.
    string malformed = new string(new char[] { 'A', (char)0xD800, (char)0xD801, 'B' });

    // 'A' (0x41), U+FFFD (EF BF BD) twice, 'B' (0x42).
    byte[] wanted = { 0x41, 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD, 0x42 };

    byte[] actual = Encoding.UTF8.GetBytes(malformed);

    CollectionAssert.AreEqual(wanted, actual);
}
333+
[TestMethod]
public void Utf8EncodingTests_TestUnpairedLowSurrogate()
{
    // A low surrogate (0xDC00) that has no high surrogate in front of it,
    // placed between 'A' and 'B'.
    string malformed = new string(new char[] { 'A', (char)0xDC00, 'B' });

    // 'A' (0x41), U+FFFD (EF BF BD), 'B' (0x42).
    byte[] wanted = { 0x41, 0xEF, 0xBF, 0xBD, 0x42 };

    byte[] actual = Encoding.UTF8.GetBytes(malformed);

    CollectionAssert.AreEqual(wanted, actual);
}
347+
[TestMethod]
public void Utf8EncodingTests_TestUnpairedHighSurrogateAtEnd()
{
    // The input ends on a high surrogate (0xD800), so no low surrogate can follow.
    string malformed = new string(new char[] { 'A', 'B', (char)0xD800 });

    // 'A' (0x41), 'B' (0x42), then U+FFFD (EF BF BD) for the dangling surrogate.
    byte[] wanted = { 0x41, 0x42, 0xEF, 0xBF, 0xBD };

    byte[] actual = Encoding.UTF8.GetBytes(malformed);

    CollectionAssert.AreEqual(wanted, actual);
}
361+
[TestMethod]
public void Utf8EncodingTests_TestPartial2ByteSequence()
{
    // 'A' followed by 0xC2, the lead byte of a 2-byte sequence whose
    // continuation byte is missing.
    byte[] truncated = { 0x41, 0xC2 };

    // 'A' followed by a single U+FFFD (EF BF BD).
    byte[] wanted = { 0x41, 0xEF, 0xBF, 0xBD };

    // NOTE(review): the last argument is presumably the expected decoded
    // character count - confirm against the RoundtripUtf8 helper.
    RoundtripUtf8(truncated, wanted, 2);
}
370+
[TestMethod]
public void Utf8EncodingTests_TestPartial3ByteSequence()
{
    // 'A' followed by the first two bytes (E2 82) of a 3-byte sequence;
    // the final continuation byte is missing.
    byte[] truncated = { 0x41, 0xE2, 0x82 };

    // 'A' followed by two U+FFFD (EF BF BD) sequences.
    byte[] wanted = { 0x41, 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD };

    // NOTE(review): the last argument is presumably the expected decoded
    // character count - confirm against the RoundtripUtf8 helper.
    RoundtripUtf8(truncated, wanted, 3);
}
379+
[TestMethod]
public void Utf8EncodingTests_TestPartial4ByteSequence()
{
    // 'A' followed by the first three bytes (F0 9F 98) of a 4-byte sequence;
    // the final continuation byte is missing.
    byte[] truncated = { 0x41, 0xF0, 0x9F, 0x98 };

    // 'A' followed by three U+FFFD (EF BF BD) sequences.
    byte[] wanted = { 0x41, 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD };

    // NOTE(review): the last argument is presumably the expected decoded
    // character count - confirm against the RoundtripUtf8 helper.
    RoundtripUtf8(truncated, wanted, 4);
}
388+
[TestMethod]
public void Utf8EncodingTests_TestMixedValidAndInvalidSequences()
{
    // Well-formed and malformed sequences interleaved in one buffer.
    byte[] payload =
    {
        0x41,                   // 'A' - valid ASCII
        0xC2, 0xA9,             // © - valid 2-byte
        0xE2, 0x82,             // incomplete 3-byte
        0x42,                   // 'B' - valid ASCII
        0xF0, 0x9F, 0x98, 0x80, // 😀 - valid 4-byte
        0xED, 0xA0, 0x80        // invalid surrogate
    };

    string text = Encoding.UTF8.GetString(payload, 0, payload.Length);

    Assert.IsNotNull(text);

    // Only the survival of these three well-formed characters is checked;
    // the exact replacement output for the malformed parts is not asserted.
    string[] survivors = { "A", "©", "B" };
    foreach (string survivor in survivors)
    {
        Assert.IsTrue(text.Contains(survivor));
    }
}
409+
[TestMethod]
public void Utf8EncodingTests_TestValidSurrogatePair()
{
    // A well-formed surrogate pair: 😀 (U+1F600) must encode to F0 9F 98 80
    // and decode back to the same string.
    string emoji = "😀";
    byte[] wanted = { 0xF0, 0x9F, 0x98, 0x80 };

    // Encode direction.
    byte[] utf8 = Encoding.UTF8.GetBytes(emoji);
    CollectionAssert.AreEqual(wanted, utf8);

    // Decode direction.
    string roundTripped = Encoding.UTF8.GetString(utf8, 0, utf8.Length);
    Assert.AreEqual(emoji, roundTripped);
}
424+
[TestMethod]
public void Utf8EncodingTests_TestMultipleSurrogatePairs()
{
    // Three emoji back to back, each one a surrogate pair in UTF-16;
    // encoding and then decoding must reproduce the original string.
    string emojis = "😀😁😂";

    byte[] utf8 = Encoding.UTF8.GetBytes(emojis);
    string roundTripped = Encoding.UTF8.GetString(utf8, 0, utf8.Length);

    Assert.AreEqual(emojis, roundTripped);
}
434+
[TestMethod]
public void Utf8EncodingTests_TestIncrementalDecodingMultiByte()
{
    // Decoder.Convert on a complete 3-byte sequence (E2 82 AC, the euro sign)
    // with a destination buffer of exactly one char.
    byte[] euroBytes = { 0xE2, 0x82, 0xAC };
    char[] destination = new char[1];

    int consumedBytes;
    int producedChars;
    bool finished;

    var utf8Decoder = Encoding.UTF8.GetDecoder();
    utf8Decoder.Convert(
        euroBytes, 0, 3,
        destination, 0, 1,
        false,
        out consumedBytes, out producedChars, out finished);

    // All three bytes collapse into a single UTF-16 code unit.
    Assert.AreEqual(3, consumedBytes);
    Assert.AreEqual(1, producedChars);
    Assert.AreEqual('€', destination[0]);
}
452+
[TestMethod]
public void Utf8EncodingTests_TestOverlongEncodingRejection()
{
    // C1 81 is an overlong two-byte form of 'A' (whose only valid encoding
    // is the single byte 0x41). The decoder is expected to reject it and
    // emit U+FFFD instead of 'A'.
    byte[] input = new byte[] { 0xC1, 0x81 };
    string decoded = Encoding.UTF8.GetString(input, 0, input.Length);
    byte[] reencoded = Encoding.UTF8.GetBytes(decoded);

    // The re-encoded output must not have the same length as the input.
    Assert.AreNotEqual(input.Length, reencoded.Length);

    // Look for U+FFFD on its own. The literal must not carry a trailing
    // space: "\uFFFD " would only match a replacement character that is
    // followed by a space, which this input cannot produce.
    Assert.IsTrue(decoded.Contains("\uFFFD"));
}
466+
[TestMethod]
public void Utf8EncodingTests_TestSequentialInvalidBytes()
{
    // Three invalid bytes in a row (FE FF FE).
    byte[] garbage = { 0xFE, 0xFF, 0xFE };

    // One U+FFFD (EF BF BD) is expected per invalid byte.
    byte[] wanted =
    {
        0xEF, 0xBF, 0xBD,
        0xEF, 0xBF, 0xBD,
        0xEF, 0xBF, 0xBD
    };

    string text = Encoding.UTF8.GetString(garbage, 0, garbage.Length);
    byte[] utf8Again = Encoding.UTF8.GetBytes(text);

    CollectionAssert.AreEqual(wanted, utf8Again);
    Assert.AreEqual(3, text.Length);
}
480+
[TestMethod]
public void Utf8EncodingTests_TestBoundaryCodepoints()
{
    // Code points sitting on the edges between UTF-8 sequence lengths.
    // Each sample is passed to RoundtripUtf8 as both the input and the
    // expected output, in the order listed.
    byte[][] boundarySamples =
    {
        new byte[] { 0x7F },             // U+007F - last 1-byte character
        new byte[] { 0xC2, 0x80 },       // U+0080 - first 2-byte character
        new byte[] { 0xDF, 0xBF },       // U+07FF - last 2-byte character
        new byte[] { 0xE0, 0xA0, 0x80 }, // U+0800 - first 3-byte character
        new byte[] { 0xEF, 0xBF, 0xBF }  // U+FFFF - last 3-byte character (excluding surrogates)
    };

    foreach (byte[] sample in boundarySamples)
    {
        // NOTE(review): the last argument is presumably the expected decoded
        // character count - confirm against the RoundtripUtf8 helper.
        RoundtripUtf8(sample, sample, 1);
    }
}
278506 }
279507}
0 commit comments