@@ -252,7 +252,20 @@ const std::error_category & __cdecl linux_category()
252
252
253
253
}
254
254
255
- #define LOWER_6BITS 0x3F
255
+ #define LOW_3BITS 0x7 // payload bits kept from the lead byte of a 4-byte UTF-8 sequence
256
+ #define LOW_4BITS 0xF // payload bits kept from the lead byte of a 3-byte UTF-8 sequence
257
+ #define LOW_5BITS 0x1F // payload bits kept from the lead byte of a 2-byte UTF-8 sequence
258
+ #define LOW_6BITS 0x3F // payload bits carried by every UTF-8 continuation byte
259
+ #define BIT4 0x8 // 4th-lowest bit; must be clear in the lead byte of a 4-byte sequence
260
+ #define BIT5 0x10 // 5th-lowest bit; clear in the lead byte of a 3-byte sequence
261
+ #define BIT6 0x20 // 6th-lowest bit; clear in the lead byte of a 2-byte sequence
262
+ #define BIT7 0x40 // 7th-lowest bit; clear (with BIT8 set) marks a 10xxxxxx continuation byte
263
+ #define BIT8 0x80 // top bit of a byte; clear for single-byte characters, OR-ed into emitted continuation bytes
264
+ #define L_SURROGATE_START 0xDC00 // first UTF-16 low-surrogate code unit
265
+ #define L_SURROGATE_END 0xDFFF // last UTF-16 low-surrogate code unit
266
+ #define H_SURROGATE_START 0xD800 // first UTF-16 high-surrogate code unit
267
+ #define H_SURROGATE_END 0xDBFF // last UTF-16 high-surrogate code unit
268
+ #define SURROGATE_PAIR_START 0x10000 // first code point that UTF-16 encodes as a surrogate pair
256
269
257
270
utf16string __cdecl conversions::utf8_to_utf16 (const std::string &s)
258
271
{
@@ -265,58 +278,63 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
265
278
// of the characters are not just ASCII and collapse.
266
279
dest.reserve (static_cast <size_t >(s.size () * .70 ));
267
280
268
- const unsigned char *src = reinterpret_cast <const unsigned char *>(s.c_str ());
269
- auto srcRemainingSize = s.size ();
270
- while (srcRemainingSize > 0 )
281
+ for (auto src = s.begin (); src != s.end (); ++src)
271
282
{
272
- if (*src < 0x7F ) // single byte character, 0x0 to 0x7F
283
+ if (( *src & BIT8) == 0 ) // single byte character, 0x0 to 0x7F
273
284
{
274
285
dest.push_back (utf16string::value_type (*src));
275
286
}
276
287
else
277
288
{
278
289
unsigned char numContBytes = 0 ;
279
- int32_t codePoint;
280
- if (*src <= 0xDF ) // 2 byte character, 0x80 to 0x7FF
290
+ uint32_t codePoint;
291
+ if (( *src & BIT6) == 0 ) // 2 byte character, 0x80 to 0x7FF
281
292
{
282
- codePoint = *src & 0x1F ;
293
+ if ((*src & BIT8) != 0 && (*src & BIT7) == 0 )
294
+ {
295
+ throw std::range_error (" UTF-8 string character can never start with 10xxxxxx" );
296
+ }
297
+ codePoint = *src & LOW_5BITS;
283
298
numContBytes = 1 ;
284
299
}
285
- else if (*src <= 0xEF ) // 3 byte character, 0x800 to 0xFFFF
300
+ else if (( *src & BIT5) == 0 ) // 3 byte character, 0x800 to 0xFFFF
286
301
{
287
- codePoint = *src & 0xF ;
302
+ codePoint = *src & LOW_4BITS ;
288
303
numContBytes = 2 ;
289
304
}
290
- else if (*src <= 0xF7 ) // 4 byte character, 0x10000 to 0x10FFFF
305
+ else if (( *src & BIT4) == 0 ) // 4 byte character, 0x10000 to 0x10FFFF
291
306
{
292
- codePoint = *src & 0x7 ;
307
+ codePoint = *src & LOW_3BITS ;
293
308
numContBytes = 3 ;
294
309
}
295
310
else
296
311
{
297
312
throw std::range_error (" UTF-8 string has invalid Unicode code point" );
298
313
}
299
- srcRemainingSize -= numContBytes;
300
- if (srcRemainingSize <= 0 )
301
- {
302
- throw std::range_error (" UTF-8 string is missing bytes in character" );
303
- }
304
314
305
315
for (unsigned char i = 0 ; i < numContBytes; ++i)
306
316
{
317
+ if (++src == s.end ())
318
+ {
319
+ throw std::range_error (" UTF-8 string is missing bytes in character" );
320
+ }
321
+ if ((*src & BIT8) == 0 || (*src & BIT7) != 0 )
322
+ {
323
+ throw std::range_error (" UTF-8 continuation byte is missing leading byte" );
324
+ }
307
325
codePoint <<= 6 ;
308
- codePoint |= *++ src & LOWER_6BITS ;
326
+ codePoint |= *src & LOW_6BITS ;
309
327
}
310
328
311
- if (numContBytes == 3 )
329
+ if (codePoint >= SURROGATE_PAIR_START )
312
330
{
313
- // In UTF-16 U+1000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
331
+ // In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
314
332
// - 0x10000 is subtracted from the code point
315
333
// - high surrogate is 0xD800 added to the top ten bits
316
334
// - low surrogate is 0xDC00 added to the low ten bits
317
- codePoint -= 0x10000 ;
318
- dest.push_back (utf16string::value_type ((codePoint >> 10 ) + 0xD800 ));
319
- dest.push_back (utf16string::value_type ((codePoint & 0x3FF ) + 0xDC00 ));
335
+ codePoint -= SURROGATE_PAIR_START ;
336
+ dest.push_back (utf16string::value_type ((codePoint >> 10 ) | H_SURROGATE_START ));
337
+ dest.push_back (utf16string::value_type ((codePoint & 0x3FF ) | L_SURROGATE_START ));
320
338
}
321
339
else
322
340
{
@@ -326,9 +344,6 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
326
344
dest.push_back (utf16string::value_type (codePoint));
327
345
}
328
346
}
329
-
330
- --srcRemainingSize;
331
- ++src;
332
347
}
333
348
return dest;
334
349
#endif
@@ -342,59 +357,56 @@ std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
342
357
#else
343
358
std::string dest;
344
359
dest.reserve (w.size ());
345
- const utf16string::value_type *src = w.c_str ();
346
- auto srcRemainingSize = w.size ();
347
- while (srcRemainingSize > 0 )
360
+ for (auto src = w.begin (); src != w.end (); ++src)
348
361
{
349
362
// Check for high surrogate.
350
- if (*src >= 0xD800 && *src <= 0xDBFF )
363
+ if (*src >= H_SURROGATE_START && *src <= H_SURROGATE_END )
351
364
{
352
- if (--srcRemainingSize == 0 )
365
+ const auto highSurrogate = *src;
366
+ if (++src == w.end ())
353
367
{
354
368
throw std::range_error (" UTF-16 string is missing low surrogate" );
355
369
}
370
+ const auto lowSurrogate = *src;
371
+ if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
372
+ {
373
+ throw std::range_error (" UTF-16 string has invalid low surrogate" );
374
+ }
356
375
357
376
// To get from surrogate pair to Unicode code point:
358
377
// - subtract 0xD800 from high surrogate, this forms top ten bits
359
378
// - subtract 0xDC00 from low surrogate, this forms low ten bits
360
379
// - add 0x10000
361
380
// Leaves a code point in U+10000 to U+10FFFF range.
362
- uint32_t codePoint = *src - 0xD800 ;
381
+ uint32_t codePoint = highSurrogate - H_SURROGATE_START ;
363
382
codePoint <<= 10 ;
364
- codePoint += *++src - 0xDC00 ;
365
- codePoint += 0x10000 ;
366
- if (*src < 0xDC00 || *src > 0xDFFF )
367
- {
368
- throw std::range_error (" UTF-16 string has invalid low surrogate" );
369
- }
383
+ codePoint |= lowSurrogate - L_SURROGATE_START;
384
+ codePoint |= SURROGATE_PAIR_START;
370
385
371
386
// 4 bytes needed (21 bits used)
372
387
dest.push_back (char (codePoint >> 18 ) | 0xF0 ); // leading 3 bits
373
- dest.push_back (((codePoint >> 12 ) & LOWER_6BITS ) | 0x80 ); // next 6 bits
374
- dest.push_back (((codePoint >> 6 ) & LOWER_6BITS ) | 0x80 ); // next 6 bits
375
- dest.push_back ((codePoint & LOWER_6BITS ) | 0x80 ); // trailing 6 bits
388
+ dest.push_back (((codePoint >> 12 ) & LOW_6BITS ) | BIT8 ); // next 6 bits
389
+ dest.push_back (((codePoint >> 6 ) & LOW_6BITS ) | BIT8 ); // next 6 bits
390
+ dest.push_back ((codePoint & LOW_6BITS ) | BIT8 ); // trailing 6 bits
376
391
}
377
- else if (*src <= 0xFFFF )
392
+ else
378
393
{
379
- if (*src < 0x7F ) // single byte character
394
+ if (*src <= 0x7F ) // single byte character
380
395
{
381
396
dest.push_back (static_cast <char >(*src));
382
397
}
383
398
else if (*src <= 0x7FF ) // 2 bytes needed (11 bits used)
384
399
{
385
400
dest.push_back (char (*src >> 6 ) | 0xC0 ); // leading 5 bits
386
- dest.push_back ((*src & LOWER_6BITS ) | 0x80 ); // trailing 6 bits
401
+ dest.push_back ((*src & LOW_6BITS ) | BIT8 ); // trailing 6 bits
387
402
}
388
403
else // 3 bytes needed (16 bits used)
389
404
{
390
405
dest.push_back ((*src >> 12 ) | 0xE0 ); // leading 4 bits
391
- dest.push_back (((*src >> 6 ) & LOWER_6BITS ) | 0x80 ); // middle 6 bits
392
- dest.push_back ((*src & LOWER_6BITS ) | 0x80 ); // trailing 6 bits
406
+ dest.push_back (((*src >> 6 ) & LOW_6BITS ) | BIT8 ); // middle 6 bits
407
+ dest.push_back ((*src & LOW_6BITS ) | BIT8 ); // trailing 6 bits
393
408
}
394
409
}
395
-
396
- --srcRemainingSize;
397
- ++src;
398
410
}
399
411
400
412
return dest;
0 commit comments