1- // ===-- Unittests for character_converter utf8->3 ---- ---------------------===//
1+ // ===-- Unittests for character_converter utf8->utf32 ---------------------===//
22//
33// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44// See https://llvm.org/LICENSE.txt for license information.
@@ -30,7 +30,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
3030 LIBC_NAMESPACE::internal::mbstate state;
3131 state.bytes_processed = 0 ;
3232 state.total_bytes = 0 ;
33- const char *ch = " " ; // hex 0xC2, 0x8E
33+ const char ch[ 2 ] = { static_cast < char >( 0xC2 ), static_cast < char >( 0x8E )} ; // U+008E (non-printing), UTF-8 bytes 0xC2, 0x8E
3434
3535 LIBC_NAMESPACE::internal::CharacterConverter char_conv (&state);
3636 char_conv.push (static_cast <char8_t >(ch[0 ]));
@@ -45,7 +45,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
4545 LIBC_NAMESPACE::internal::mbstate state;
4646 state.bytes_processed = 0 ;
4747 state.total_bytes = 0 ;
48- const char *ch = " ∑" ; // hex 0xE2, 0x88, 0x91
48+ const char ch[3 ] = {static_cast <char >(0xE2 ), static_cast <char >(0x88 ),
49+ static_cast <char >(0x91 )}; // ∑
4950
5051 LIBC_NAMESPACE::internal::CharacterConverter char_conv (&state);
5152 char_conv.push (static_cast <char8_t >(ch[0 ]));
@@ -61,7 +62,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
6162 LIBC_NAMESPACE::internal::mbstate state;
6263 state.bytes_processed = 0 ;
6364 state.total_bytes = 0 ;
64- const char *ch = " 🤡" ; // hex 0xF0, 0x9F, 0xA4, 0xA1
65+ const char ch[4 ] = {static_cast <char >(0xF0 ), static_cast <char >(0x9F ),
66+ static_cast <char >(0xA4 ), static_cast <char >(0xA1 )}; // 🤡
6567
6668 LIBC_NAMESPACE::internal::CharacterConverter char_conv (&state);
6769 char_conv.push (static_cast <char8_t >(ch[0 ]));
@@ -90,36 +92,85 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
9092 LIBC_NAMESPACE::internal::mbstate state;
9193 state.bytes_processed = 0 ;
9294 state.total_bytes = 0 ;
93- const char ch[4 ] = {static_cast < char >( 0x80 ), static_cast < char >( 0x00 ),
94- static_cast <char >(0x00 ),
95- static_cast <char >(0x00 )}; // All bytes are invalid
95+ const char ch[4 ] = {
96+ static_cast < char >( 0x80 ), static_cast <char >(0x00 ), static_cast < char >( 0x80 ),
97+ static_cast <char >(0x00 )}; // first, third, and last bytes are invalid
9698
9799 LIBC_NAMESPACE::internal::CharacterConverter char_conv (&state);
98100 int err = char_conv.push (static_cast <char8_t >(ch[0 ]));
99101 ASSERT_EQ (err, -1 );
100102 err = char_conv.push (static_cast <char8_t >(ch[1 ]));
101- ASSERT_EQ (err, -1 );
103+ ASSERT_EQ (err, 0 );
104+ // Prev byte was single byte so trying to read another should error.
102105 err = char_conv.push (static_cast <char8_t >(ch[2 ]));
103106 ASSERT_EQ (err, -1 );
104107 err = char_conv.push (static_cast <char8_t >(ch[3 ]));
105108 ASSERT_EQ (err, -1 );
106109}
107110
// Diff note: the old InvalidMiddleByte test is renamed to InvalidLastByte.
// The 0xC0 byte moves from index 1 to index 3 of the input, so the new test
// asserts that the first three pushes return 0 and only the fourth push
// returns -1.
108- TEST (LlvmLibcCharacterConverterUTF8To32Test, InvalidMiddleByte ) {
111+ TEST (LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte ) {
109112 LIBC_NAMESPACE::internal::mbstate state;
110113 state.bytes_processed = 0 ;
111114 state.total_bytes = 0 ;
// Input: 0xF1 followed by 0x80, 0x80, 0xC0 in the new version.
112- const char ch[4 ] = {static_cast <char >(0xF1 ), static_cast <char >(0xC0 ),
115+ const char ch[4 ] = {static_cast <char >(0xF1 ), static_cast <char >(0x80 ),
113116 static_cast <char >(0x80 ),
114- static_cast <char >(0x80 )}; // invalid second byte
117+ static_cast <char >(0xC0 )}; // invalid last byte
115118
116119 LIBC_NAMESPACE::internal::CharacterConverter char_conv (&state);
// Bytes 0..2 are each expected to be accepted (push returns 0).
117120 int err = char_conv.push (static_cast <char8_t >(ch[0 ]));
118121 ASSERT_EQ (err, 0 );
119122 err = char_conv.push (static_cast <char8_t >(ch[1 ]));
123+ ASSERT_EQ (err, 0 );
124+ err = char_conv.push (static_cast <char8_t >(ch[2 ]));
125+ ASSERT_EQ (err, 0 );
// The final byte (0xC0) is the one expected to be rejected.
126+ err = char_conv.push (static_cast <char8_t >(ch[3 ]));
120127 ASSERT_EQ (err, -1 );
128+ }
129+
// New test: pushes the two-byte sequence 0xC2 0x8E, then one extra byte
// (0x80) before popping. The extra push is asserted to return -1, and the
// subsequent pop_utf32 is asserted to succeed with the value 142.
130+ TEST (LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
131+ LIBC_NAMESPACE::internal::mbstate state;
132+ state.bytes_processed = 0 ;
133+ state.total_bytes = 0 ;
134+ const char ch[3 ] = {static_cast <char >(0xC2 ), static_cast <char >(0x8E ),
135+ static_cast <char >(0x80 )};
136+
137+ LIBC_NAMESPACE::internal::CharacterConverter char_conv (&state);
// Both bytes of the two-byte sequence are expected to be accepted.
138+ int err = char_conv.push (static_cast <char8_t >(ch[0 ]));
139+ ASSERT_EQ (err, 0 );
140+ err = char_conv.push (static_cast <char8_t >(ch[1 ]));
141+ ASSERT_EQ (err, 0 );
142+ // Should produce an error on 3rd byte
143+ err = char_conv.push (static_cast <char8_t >(ch[2 ]));
144+ ASSERT_EQ (err, -1 );
145+
146+ LIBC_NAMESPACE::internal::utf_ret<char32_t > wch = char_conv.pop_utf32 ();
147+ ASSERT_EQ (wch.error , 0 );
148+ // Should still output the correct result.
// 142 == 0x8E: ((0xC2 & 0x1F) << 6) | (0x8E & 0x3F).
149+ ASSERT_EQ (static_cast <int >(wch.out ), 142 );
150+ }
151+
// New test: feeds two consecutive two-byte sequences (0xC2 0x8E, then
// 0xC7 0x8C) through the same converter, calling pop_utf32 after each pair.
// The trailing push/assert lines are unchanged context lines reused from the
// previous version of the file.
152+ TEST (LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
153+ LIBC_NAMESPACE::internal::mbstate state;
154+ state.bytes_processed = 0 ;
155+ state.total_bytes = 0 ;
156+ const char ch[4 ] = {static_cast <char >(0xC2 ), static_cast <char >(0x8E ),
157+ static_cast <char >(0xC7 ), static_cast <char >(0x8C )};
158+
159+ LIBC_NAMESPACE::internal::CharacterConverter char_conv (&state);
// First two-byte character: both pushes expected to return 0.
160+ int err = char_conv.push (static_cast <char8_t >(ch[0 ]));
161+ ASSERT_EQ (err, 0 );
162+ err = char_conv.push (static_cast <char8_t >(ch[1 ]));
163+ ASSERT_EQ (err, 0 );
164+ LIBC_NAMESPACE::internal::utf_ret<char32_t > wch = char_conv.pop_utf32 ();
165+ ASSERT_EQ (wch.error , 0 );
// 142 == ((0xC2 & 0x1F) << 6) | (0x8E & 0x3F).
166+ ASSERT_EQ (static_cast <int >(wch.out ), 142 );
167+
168+ // Second two byte character
121169 err = char_conv.push (static_cast <char8_t >(ch[2 ]));
122170 ASSERT_EQ (err, 0 );
123171 err = char_conv.push (static_cast <char8_t >(ch[3 ]));
124172 ASSERT_EQ (err, 0 );
173+ wch = char_conv.pop_utf32 ();
174+ ASSERT_EQ (wch.error , 0 );
// 460 == ((0xC7 & 0x1F) << 6) | (0x8C & 0x3F).
175+ ASSERT_EQ (static_cast <int >(wch.out ), 460 );
125176}
0 commit comments