Skip to content

Commit e470025

Browse files
committed
Fixing a couple of bugs and improving code readability of UTF8/UTF16 conversions.
1 parent a3b8d82 commit e470025

File tree

2 files changed

+118
-76
lines changed

2 files changed

+118
-76
lines changed

Release/src/utilities/asyncrt_utils.cpp

Lines changed: 61 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,20 @@ const std::error_category & __cdecl linux_category()
252252

253253
}
254254

255-
#define LOWER_6BITS 0x3F
255+
#define LOW_3BITS 0x7
256+
#define LOW_4BITS 0xF
257+
#define LOW_5BITS 0x1F
258+
#define LOW_6BITS 0x3F
259+
#define BIT4 0x8
260+
#define BIT5 0x10
261+
#define BIT6 0x20
262+
#define BIT7 0x40
263+
#define BIT8 0x80
264+
#define L_SURROGATE_START 0xDC00
265+
#define L_SURROGATE_END 0xDFFF
266+
#define H_SURROGATE_START 0xD800
267+
#define H_SURROGATE_END 0xDBFF
268+
#define SURROGATE_PAIR_START 0x10000
256269

257270
utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
258271
{
@@ -265,58 +278,63 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
265278
// of the characters are not just ASCII and collapse.
266279
dest.reserve(static_cast<size_t>(s.size() * .70));
267280

268-
const unsigned char *src = reinterpret_cast<const unsigned char *>(s.c_str());
269-
auto srcRemainingSize = s.size();
270-
while (srcRemainingSize > 0)
281+
for (auto src = s.begin(); src != s.end(); ++src)
271282
{
272-
if (*src < 0x7F) // single byte character, 0x0 to 0x7F
283+
if ((*src & BIT8) == 0) // single byte character, 0x0 to 0x7F
273284
{
274285
dest.push_back(utf16string::value_type(*src));
275286
}
276287
else
277288
{
278289
unsigned char numContBytes = 0;
279-
int32_t codePoint;
280-
if (*src <= 0xDF) // 2 byte character, 0x80 to 0x7FF
290+
uint32_t codePoint;
291+
if ((*src & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF
281292
{
282-
codePoint = *src & 0x1F;
293+
if ((*src & BIT8) != 0 && (*src & BIT7) == 0)
294+
{
295+
throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
296+
}
297+
codePoint = *src & LOW_5BITS;
283298
numContBytes = 1;
284299
}
285-
else if (*src <= 0xEF) // 3 byte character, 0x800 to 0xFFFF
300+
else if ((*src & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF
286301
{
287-
codePoint = *src & 0xF;
302+
codePoint = *src & LOW_4BITS;
288303
numContBytes = 2;
289304
}
290-
else if (*src <= 0xF7) // 4 byte character, 0x10000 to 0x10FFFF
305+
else if ((*src & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF
291306
{
292-
codePoint = *src & 0x7;
307+
codePoint = *src & LOW_3BITS;
293308
numContBytes = 3;
294309
}
295310
else
296311
{
297312
throw std::range_error("UTF-8 string has invalid Unicode code point");
298313
}
299-
srcRemainingSize -= numContBytes;
300-
if (srcRemainingSize <= 0)
301-
{
302-
throw std::range_error("UTF-8 string is missing bytes in character");
303-
}
304314

305315
for (unsigned char i = 0; i < numContBytes; ++i)
306316
{
317+
if (++src == s.end())
318+
{
319+
throw std::range_error("UTF-8 string is missing bytes in character");
320+
}
321+
if ((*src & BIT8) == 0 || (*src & BIT7) != 0)
322+
{
323+
throw std::range_error("UTF-8 continuation byte is missing leading byte");
324+
}
307325
codePoint <<= 6;
308-
codePoint |= *++src & LOWER_6BITS;
326+
codePoint |= *src & LOW_6BITS;
309327
}
310328

311-
if (numContBytes == 3)
329+
if (codePoint >= SURROGATE_PAIR_START)
312330
{
313-
// In UTF-16 U+1000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
331+
// In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
314332
// - 0x10000 is subtracted from the code point
315333
// - high surrogate is 0xD800 added to the top ten bits
316334
// - low surrogate is 0xDC00 added to the low ten bits
317-
codePoint -= 0x10000;
318-
dest.push_back(utf16string::value_type((codePoint >> 10) + 0xD800));
319-
dest.push_back(utf16string::value_type((codePoint & 0x3FF) + 0xDC00));
335+
codePoint -= SURROGATE_PAIR_START;
336+
dest.push_back(utf16string::value_type((codePoint >> 10) | H_SURROGATE_START));
337+
dest.push_back(utf16string::value_type((codePoint & 0x3FF) | L_SURROGATE_START));
320338
}
321339
else
322340
{
@@ -326,9 +344,6 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
326344
dest.push_back(utf16string::value_type(codePoint));
327345
}
328346
}
329-
330-
--srcRemainingSize;
331-
++src;
332347
}
333348
return dest;
334349
#endif
@@ -342,59 +357,56 @@ std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
342357
#else
343358
std::string dest;
344359
dest.reserve(w.size());
345-
const utf16string::value_type *src = w.c_str();
346-
auto srcRemainingSize = w.size();
347-
while (srcRemainingSize > 0)
360+
for (auto src = w.begin(); src != w.end(); ++src)
348361
{
349362
// Check for high surrogate.
350-
if (*src >= 0xD800 && *src <= 0xDBFF)
363+
if (*src >= H_SURROGATE_START && *src <= H_SURROGATE_END)
351364
{
352-
if (--srcRemainingSize == 0)
365+
const auto highSurrogate = *src;
366+
if (++src == w.end())
353367
{
354368
throw std::range_error("UTF-16 string is missing low surrogate");
355369
}
370+
const auto lowSurrogate = *src;
371+
if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
372+
{
373+
throw std::range_error("UTF-16 string has invalid low surrogate");
374+
}
356375

357376
// To get from surrogate pair to Unicode code point:
358377
// - subtract 0xD800 from high surrogate, this forms top ten bits
359378
// - subtract 0xDC00 from low surrogate, this forms low ten bits
360379
// - add 0x10000
361380
// Leaves a code point in U+10000 to U+10FFFF range.
362-
uint32_t codePoint = *src - 0xD800;
381+
uint32_t codePoint = highSurrogate - H_SURROGATE_START;
363382
codePoint <<= 10;
364-
codePoint += *++src - 0xDC00;
365-
codePoint += 0x10000;
366-
if (*src < 0xDC00 || *src > 0xDFFF)
367-
{
368-
throw std::range_error("UTF-16 string has invalid low surrogate");
369-
}
383+
codePoint |= lowSurrogate - L_SURROGATE_START;
384+
codePoint |= SURROGATE_PAIR_START;
370385

371386
// 4 bytes needed (21 bits used)
372387
dest.push_back(char(codePoint >> 18) | 0xF0); // leading 3 bits
373-
dest.push_back(((codePoint >> 12) & LOWER_6BITS) | 0x80); // next 6 bits
374-
dest.push_back(((codePoint >> 6) & LOWER_6BITS) | 0x80); // next 6 bits
375-
dest.push_back((codePoint & LOWER_6BITS) | 0x80); // trailing 6 bits
388+
dest.push_back(((codePoint >> 12) & LOW_6BITS) | BIT8); // next 6 bits
389+
dest.push_back(((codePoint >> 6) & LOW_6BITS) | BIT8); // next 6 bits
390+
dest.push_back((codePoint & LOW_6BITS) | BIT8); // trailing 6 bits
376391
}
377-
else if (*src <= 0xFFFF)
392+
else
378393
{
379-
if (*src < 0x7F) // single byte character
394+
if (*src <= 0x7F) // single byte character
380395
{
381396
dest.push_back(static_cast<char>(*src));
382397
}
383398
else if (*src <= 0x7FF) // 2 bytes needed (11 bits used)
384399
{
385400
dest.push_back(char(*src >> 6) | 0xC0); // leading 5 bits
386-
dest.push_back((*src & LOWER_6BITS) | 0x80); // trailing 6 bits
401+
dest.push_back((*src & LOW_6BITS) | BIT8); // trailing 6 bits
387402
}
388403
else // 3 bytes needed (16 bits used)
389404
{
390405
dest.push_back((*src >> 12) | 0xE0); // leading 4 bits
391-
dest.push_back(((*src >> 6) & LOWER_6BITS) | 0x80); // middle 6 bits
392-
dest.push_back((*src & LOWER_6BITS) | 0x80); // trailing 6 bits
406+
dest.push_back(((*src >> 6) & LOW_6BITS) | BIT8); // middle 6 bits
407+
dest.push_back((*src & LOW_6BITS) | BIT8); // trailing 6 bits
393408
}
394409
}
395-
396-
--srcRemainingSize;
397-
++src;
398410
}
399411

400412
return dest;

Release/tests/functional/utils/strings.cpp

Lines changed: 57 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,17 @@ TEST(utf16_to_utf8)
6363

6464
// encodes to single byte character
6565
VERIFY_ARE_EQUAL("ABC987", utility::conversions::utf16_to_utf8(UTF16("ABC987")));
66+
utf16string input;
67+
input.push_back(0x7F); // last ASCII character
68+
auto result = utility::conversions::utf16_to_utf8(input);
69+
VERIFY_ARE_EQUAL(0x7F, result[0]);
6670

6771
// encodes to 2 byte character
68-
utf16string input;
72+
input.clear();
6973
input.push_back(0x80);
7074
input.push_back(0x14D);
7175
input.push_back(0x7FF);
72-
auto result = utility::conversions::utf16_to_utf8(input);
76+
result = utility::conversions::utf16_to_utf8(input);
7377
#if defined(__GLIBCXX__)
7478
VERIFY_ARE_EQUAL(-62, result[0]);
7579
VERIFY_ARE_EQUAL(-128, result[1]);
@@ -139,53 +143,63 @@ TEST(utf8_to_utf16)
139143

140144
// single byte character
141145
VERIFY_ARE_EQUAL(UTF16("ABC123"), utility::conversions::utf8_to_utf16("ABC123"));
142-
143-
// 2 byte character
144146
std::string input;
145-
input.push_back(207u); // 11001111
146-
input.push_back(129u); // 10000001
147-
input.push_back(198u); // 11000110
148-
input.push_back(141u); // 10001101
147+
input.push_back(0x7F); // last ASCII character
149148
auto result = utility::conversions::utf8_to_utf16(input);
149+
VERIFY_ARE_EQUAL(0x7F, result[0]);
150+
151+
// 2 byte character
152+
input.clear();
153+
// U+400
154+
input.push_back(208u); // 11010000
155+
input.push_back(128u); // 10000000
156+
// U+7FF
157+
input.push_back(223u); // 11011111
158+
input.push_back(191u); // 10111111
159+
result = utility::conversions::utf8_to_utf16(input);
150160
#if defined(__GLIBCXX__)
151-
VERIFY_ARE_EQUAL(961, result[0]);
152-
VERIFY_ARE_EQUAL(397, result[1]);
161+
VERIFY_ARE_EQUAL(1024, result[0]);
162+
VERIFY_ARE_EQUAL(2047, result[1]);
153163
#else
154164
VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
155165
#endif
156166

157167
// 3 byte character
158168
input.clear();
159-
input.push_back(230u); // 11100110
160-
input.push_back(141u); // 10001101
161-
input.push_back(157u); // 10011101
162-
input.push_back(231u); // 11100111
163-
input.push_back(143u); // 10001111
164-
input.push_back(156u); // 10011100
169+
// U+8000
170+
input.push_back(232u); // 11101000
171+
input.push_back(128u); // 10000000
172+
input.push_back(128u); // 10000000
173+
// U+FFFF
174+
input.push_back(239u); // 11101111
175+
input.push_back(191u); // 10111111
176+
input.push_back(191u); // 10111111
165177
result = utility::conversions::utf8_to_utf16(input);
166178
#if defined(__GLIBCXX__)
167-
VERIFY_ARE_EQUAL(25437, result[0]);
168-
VERIFY_ARE_EQUAL(29660, result[1]);
179+
VERIFY_ARE_EQUAL(32768, result[0]);
180+
VERIFY_ARE_EQUAL(65535, result[1]);
169181
#else
170182
VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
171183
#endif
172184

173185
// 4 byte character
174186
input.clear();
175-
input.push_back(240u); // 11110000
176-
input.push_back(173u); // 10101101
177-
input.push_back(157u); // 10011101
187+
// U+100000
188+
input.push_back(244u); // 11110100
189+
input.push_back(128u); // 10000000
190+
input.push_back(128u); // 10000000
191+
input.push_back(128u); // 10000000
192+
// U+10FFFF
193+
input.push_back(244u); // 11110100
178194
input.push_back(143u); // 10001111
179-
input.push_back(240u); // 11111000
180-
input.push_back(161u); // 10100001
181195
input.push_back(191u); // 10111111
182196
input.push_back(191u); // 10111111
183197
result = utility::conversions::utf8_to_utf16(input);
184198
#if defined(__GLIBCXX__)
185-
VERIFY_ARE_EQUAL(55413, result[0]);
186-
VERIFY_ARE_EQUAL(57167, result[1]);
187-
VERIFY_ARE_EQUAL(55296, result[2]);
188-
VERIFY_ARE_EQUAL(57160, result[3]);
199+
VERIFY_ARE_EQUAL(56256, result[0]);
200+
VERIFY_ARE_EQUAL(56320, result[1]);
201+
VERIFY_ARE_EQUAL(56319, result[2]);
202+
VERIFY_ARE_EQUAL(57343, result[3]);
189203
#else
190204
VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
191205
#endif
@@ -226,6 +240,22 @@ TEST(utf8_to_utf16_errors)
226240
input.push_back(173u); // 10101101
227241
input.push_back(157u); // 10011101
228242
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::range_error);
243+
244+
// continuation byte missing leading 10xxxxxx
245+
input.clear();
246+
input.push_back(230u); // 11100110
247+
input.push_back(141u); // 00001101
248+
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::range_error);
249+
input.clear();
250+
input.push_back(230u); // 11100110
251+
input.push_back(141u); // 11001101
252+
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::range_error);
253+
254+
// invalid for a first character to start with 1xxxxxxx
255+
input.clear();
256+
input.push_back(128u); // 10000000
257+
input.push_back(128u); // 10000000
258+
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::range_error);
229259
}
230260

231261
TEST(latin1_to_utf16)

0 commit comments

Comments
 (0)