Skip to content

Commit a3b8d82

Browse files
committed
Adding error case tests for UTF-16 to UTF-8 conversions. A little bit of error-handling cleanup as well.
1 parent da4307b commit a3b8d82

File tree

2 files changed

+29
-9
lines changed

2 files changed

+29
-9
lines changed

Release/src/utilities/asyncrt_utils.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -294,12 +294,12 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
294294
}
295295
else
296296
{
297-
throw std::invalid_argument("UTF-8 string has invalid Unicode code point");
297+
throw std::range_error("UTF-8 string has invalid Unicode code point");
298298
}
299299
srcRemainingSize -= numContBytes;
300300
if (srcRemainingSize <= 0)
301301
{
302-
throw std::invalid_argument("UTF-8 string is missing bytes in character");
302+
throw std::range_error("UTF-8 string is missing bytes in character");
303303
}
304304

305305
for (unsigned char i = 0; i < numContBytes; ++i)
@@ -341,18 +341,18 @@ std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
341341
return conversion.to_bytes(w);
342342
#else
343343
std::string dest;
344-
dest.reserve(w.size()); // TODO size
344+
dest.reserve(w.size());
345345
const utf16string::value_type *src = w.c_str();
346346
auto srcRemainingSize = w.size();
347347
while (srcRemainingSize > 0)
348348
{
349+
// Check for high surrogate.
349350
if (*src >= 0xD800 && *src <= 0xDBFF)
350351
{
351352
if (--srcRemainingSize == 0)
352353
{
353-
// TODO error
354+
throw std::range_error("UTF-16 string is missing low surrogate");
354355
}
355-
// Found a high surrogate.
356356

357357
// To get from surrogate pair to Unicode code point:
358358
// - subtract 0xD800 from high surrogate; this forms the top ten bits
@@ -363,6 +363,10 @@ std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
363363
codePoint <<= 10;
364364
codePoint += *++src - 0xDC00;
365365
codePoint += 0x10000;
366+
if (*src < 0xDC00 || *src > 0xDFFF)
367+
{
368+
throw std::range_error("UTF-16 string has invalid low surrogate");
369+
}
366370

367371
// 4 bytes needed, using 21 bits
368372
dest.push_back(char(codePoint >> 18) | 0xF0); // leading 3 bits

Release/tests/functional/utils/strings.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -191,32 +191,48 @@ TEST(utf8_to_utf16)
191191
#endif
192192
}
193193

194+
TEST(utf16_to_utf8_errors)
195+
{
196+
VERIFY_ARE_EQUAL("ABC987", utility::conversions::utf16_to_utf8(UTF16("ABC987")));
197+
utf16string input;
198+
199+
// high surrogate with missing low surrogate.
200+
input.push_back(0xD800);
201+
input.push_back(0x0);
202+
VERIFY_THROWS(utility::conversions::utf16_to_utf8(input), std::range_error);
203+
204+
// high surrogate with no more characters
205+
input.clear();
206+
input.push_back(0xD800);
207+
VERIFY_THROWS(utility::conversions::utf16_to_utf8(input), std::range_error);
208+
}
209+
194210
TEST(utf8_to_utf16_errors)
195211
{
196212
// missing second byte (the only continuation byte of a 2-byte sequence)
197213
std::string input;
198214
input.push_back(207u); // 11001111
199-
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::invalid_argument);
215+
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::range_error);
200216

201217
// missing third byte (the second continuation byte of a 3-byte sequence)
202218
input.clear();
203219
input.push_back(230u); // 11100110
204220
input.push_back(141u); // 10001101
205-
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::invalid_argument);
221+
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::range_error);
206222

207223
// missing fourth byte (the third continuation byte of a 4-byte sequence)
208224
input.clear();
209225
input.push_back(240u); // 11110000
210226
input.push_back(173u); // 10101101
211227
input.push_back(157u); // 10011101
212-
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::invalid_argument);
228+
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::range_error);
213229
}
214230

215231
TEST(latin1_to_utf16)
216232
{
217233
// TODO: find some string that actually uses something unique to the Latin1 code page.
218234
std::string str_latin1("This is a test");
219-
utf16string str_utf16 = utility::conversions::usascii_to_utf16(str_latin1);
235+
utf16string str_utf16 = utility::conversions::latin1_to_utf16(str_latin1);
220236

221237
for (size_t i = 0; i < str_latin1.size(); ++i)
222238
{

0 commit comments

Comments
 (0)