@@ -347,19 +347,33 @@ const std::error_category & __cdecl linux_category()
347
347
#define H_SURROGATE_END 0xDBFF
348
348
#define SURROGATE_PAIR_START 0x10000
349
349
350
+ // Use an explicitly signed byte type for UTF-8 data: plain char may be
351
+ // signed or unsigned depending on the platform, and the converters below
352
+ // rely on sign tests (e.g. `> 0`) to recognise single-byte code points.
353
+ using UtilCharInternal_t = signed char ;
354
+
355
+
350
356
inline size_t count_utf8_to_utf16 (const std::string& s)
351
357
{
352
358
const size_t sSize = s.size ();
353
- const char * const sData = s.data ();
359
+ auto sData = reinterpret_cast < const UtilCharInternal_t* const >( s.data () );
354
360
size_t result{ sSize };
361
+
355
362
for (size_t index = 0 ; index < sSize ;)
356
363
{
357
- const char c{ sData [index++] };
358
- if ((c & BIT8) == 0 )
364
+ if ( sData [index] > 0 )
359
365
{
360
- continue ;
366
+ // use fast inner loop to skip single byte code points (which are
367
+ // expected to be the most frequent)
368
+ while ((++index < sSize ) && (sData [index] > 0 ))
369
+ ;
370
+
371
+ if (index >= sSize ) break ;
361
372
}
362
373
374
+ // start special handling for multi-byte code points
375
+ const UtilCharInternal_t c{ sData [index++] };
376
+
363
377
if ((c & BIT7) == 0 )
364
378
{
365
379
throw std::range_error (" UTF-8 string character can never start with 10xxxxxx" );
@@ -371,7 +385,7 @@ inline size_t count_utf8_to_utf16(const std::string& s)
371
385
throw std::range_error (" UTF-8 string is missing bytes in character" );
372
386
}
373
387
374
- const char c2{ sData [index++] };
388
+ const UtilCharInternal_t c2{ sData [index++] };
375
389
if ((c2 & 0xC0 ) != BIT8)
376
390
{
377
391
throw std::range_error (" UTF-8 continuation byte is missing leading bit mask" );
@@ -387,8 +401,8 @@ inline size_t count_utf8_to_utf16(const std::string& s)
387
401
throw std::range_error (" UTF-8 string is missing bytes in character" );
388
402
}
389
403
390
- const char c2{ sData [index++] };
391
- const char c3{ sData [index++] };
404
+ const UtilCharInternal_t c2{ sData [index++] };
405
+ const UtilCharInternal_t c3{ sData [index++] };
392
406
if (((c2 | c3) & 0xC0 ) != BIT8)
393
407
{
394
408
throw std::range_error (" UTF-8 continuation byte is missing leading bit mask" );
@@ -403,9 +417,9 @@ inline size_t count_utf8_to_utf16(const std::string& s)
403
417
throw std::range_error (" UTF-8 string is missing bytes in character" );
404
418
}
405
419
406
- const char c2{ sData [index++] };
407
- const char c3{ sData [index++] };
408
- const char c4{ sData [index++] };
420
+ const UtilCharInternal_t c2{ sData [index++] };
421
+ const UtilCharInternal_t c3{ sData [index++] };
422
+ const UtilCharInternal_t c4{ sData [index++] };
409
423
if (((c2 | c3 | c4) & 0xC0 ) != BIT8)
410
424
{
411
425
throw std::range_error (" UTF-8 continuation byte is missing leading bit mask" );
@@ -427,21 +441,21 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
427
441
{
428
442
// Save repeated heap allocations, use the length of resulting sequence.
429
443
const size_t srcSize = s.size ();
430
- const std::string::value_type* const srcData = &s[ 0 ] ;
444
+ auto srcData = reinterpret_cast < const UtilCharInternal_t* const >(s. data ()) ;
431
445
utf16string dest (count_utf8_to_utf16 (s), L' \0 ' );
432
446
utf16string::value_type* const destData = &dest[0 ];
433
447
size_t destIndex = 0 ;
434
448
435
449
for (size_t index = 0 ; index < srcSize; ++index)
436
450
{
437
- std::string::value_type src = srcData[index];
451
+ UtilCharInternal_t src = srcData[index];
438
452
switch (src & 0xF0 )
439
453
{
440
454
case 0xF0 : // 4 byte character, 0x10000 to 0x10FFFF
441
455
{
442
- const char c2{ srcData[++index] };
443
- const char c3{ srcData[++index] };
444
- const char c4{ srcData[++index] };
456
+ const UtilCharInternal_t c2{ srcData[++index] };
457
+ const UtilCharInternal_t c3{ srcData[++index] };
458
+ const UtilCharInternal_t c4{ srcData[++index] };
445
459
uint32_t codePoint = ((src & LOW_3BITS) << 18 ) | ((c2 & LOW_6BITS) << 12 ) | ((c3 & LOW_6BITS) << 6 ) | (c4 & LOW_6BITS);
446
460
if (codePoint >= SURROGATE_PAIR_START)
447
461
{
@@ -464,20 +478,27 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
464
478
break ;
465
479
case 0xE0 : // 3 byte character, 0x800 to 0xFFFF
466
480
{
467
- const char c2{ srcData[++index] };
468
- const char c3{ srcData[++index] };
481
+ const UtilCharInternal_t c2{ srcData[++index] };
482
+ const UtilCharInternal_t c3{ srcData[++index] };
469
483
destData[destIndex++] = static_cast <utf16string::value_type>(((src & LOW_4BITS) << 12 ) | ((c2 & LOW_6BITS) << 6 ) | (c3 & LOW_6BITS));
470
484
}
471
485
break ;
472
486
case 0xD0 : // 2 byte character, 0x80 to 0x7FF
473
487
case 0xC0 :
474
488
{
475
- const char c2{ srcData[++index] };
489
+ const UtilCharInternal_t c2{ srcData[++index] };
476
490
destData[destIndex++] = static_cast <utf16string::value_type>(((src & LOW_5BITS) << 6 ) | (c2 & LOW_6BITS));
477
491
}
478
492
break ;
479
493
default : // single byte character, 0x0 to 0x7F
480
- destData[destIndex++] = static_cast <utf16string::value_type>(src);
494
+ // try to use a fast inner loop for following single byte characters,
495
+ // since they are quite probable
496
+ do
497
+ {
498
+ destData[destIndex++] = static_cast <utf16string::value_type>(srcData[index++]);
499
+ } while (index < srcSize && srcData[index] > 0 );
500
+ // adjust index since it will be incremented by the for loop
501
+ --index;
481
502
}
482
503
}
483
504
return dest;
0 commit comments