@@ -347,19 +347,33 @@ const std::error_category & __cdecl linux_category()
347347#define H_SURROGATE_END 0xDBFF
348348#define SURROGATE_PAIR_START 0x10000
349349
350+ // Dedicated signed byte type for the UTF-8 code below: plain char is signed on some platforms and
351+ // unsigned on others, and the `> 0` single-byte fast paths rely on high-bit bytes comparing negative.
352+ // NOTE(review): `> 0` also rejects NUL (0x00), so count_utf8_to_utf16 throws on it; was `>= 0` intended?
353+ using UtilCharInternal_t = signed char ;
354+
355+
350356inline size_t count_utf8_to_utf16 (const std::string& s)
351357{
352358 const size_t sSize = s.size ();
353- const char * const sData = s.data ();
359+ auto sData = reinterpret_cast < const UtilCharInternal_t* const >( s.data () );
354360 size_t result{ sSize };
361+
355362 for (size_t index = 0 ; index < sSize ;)
356363 {
357- const char c{ sData [index++] };
358- if ((c & BIT8) == 0 )
364+ if ( sData [index] > 0 )
359365 {
360- continue ;
366+ // use fast inner loop to skip single byte code points (which are
367+ // expected to be the most frequent)
368+ while ((++index < sSize ) && (sData [index] > 0 ))
369+ ;
370+
371+ if (index >= sSize ) break ;
361372 }
362373
374+ // start special handling for multi-byte code points
375+ const UtilCharInternal_t c{ sData [index++] };
376+
363377 if ((c & BIT7) == 0 )
364378 {
365379 throw std::range_error (" UTF-8 string character can never start with 10xxxxxx" );
@@ -371,7 +385,7 @@ inline size_t count_utf8_to_utf16(const std::string& s)
371385 throw std::range_error (" UTF-8 string is missing bytes in character" );
372386 }
373387
374- const char c2{ sData [index++] };
388+ const UtilCharInternal_t c2{ sData [index++] };
375389 if ((c2 & 0xC0 ) != BIT8)
376390 {
377391 throw std::range_error (" UTF-8 continuation byte is missing leading bit mask" );
@@ -387,8 +401,8 @@ inline size_t count_utf8_to_utf16(const std::string& s)
387401 throw std::range_error (" UTF-8 string is missing bytes in character" );
388402 }
389403
390- const char c2{ sData [index++] };
391- const char c3{ sData [index++] };
404+ const UtilCharInternal_t c2{ sData [index++] };
405+ const UtilCharInternal_t c3{ sData [index++] };
392406 if (((c2 | c3) & 0xC0 ) != BIT8)
393407 {
394408 throw std::range_error (" UTF-8 continuation byte is missing leading bit mask" );
@@ -403,9 +417,9 @@ inline size_t count_utf8_to_utf16(const std::string& s)
403417 throw std::range_error (" UTF-8 string is missing bytes in character" );
404418 }
405419
406- const char c2{ sData [index++] };
407- const char c3{ sData [index++] };
408- const char c4{ sData [index++] };
420+ const UtilCharInternal_t c2{ sData [index++] };
421+ const UtilCharInternal_t c3{ sData [index++] };
422+ const UtilCharInternal_t c4{ sData [index++] };
409423 if (((c2 | c3 | c4) & 0xC0 ) != BIT8)
410424 {
411425 throw std::range_error (" UTF-8 continuation byte is missing leading bit mask" );
@@ -427,21 +441,21 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
427441{
428442 // Save repeated heap allocations, use the length of resulting sequence.
429443 const size_t srcSize = s.size ();
430- const std::string::value_type* const srcData = &s[ 0 ] ;
444+ auto srcData = reinterpret_cast < const UtilCharInternal_t* const >(s. data ()) ;
431445 utf16string dest (count_utf8_to_utf16 (s), L' \0 ' );
432446 utf16string::value_type* const destData = &dest[0 ];
433447 size_t destIndex = 0 ;
434448
435449 for (size_t index = 0 ; index < srcSize; ++index)
436450 {
437- std::string::value_type src = srcData[index];
451+ UtilCharInternal_t src = srcData[index];
438452 switch (src & 0xF0 )
439453 {
440454 case 0xF0 : // 4 byte character, 0x10000 to 0x10FFFF
441455 {
442- const char c2{ srcData[++index] };
443- const char c3{ srcData[++index] };
444- const char c4{ srcData[++index] };
456+ const UtilCharInternal_t c2{ srcData[++index] };
457+ const UtilCharInternal_t c3{ srcData[++index] };
458+ const UtilCharInternal_t c4{ srcData[++index] };
445459 uint32_t codePoint = ((src & LOW_3BITS) << 18 ) | ((c2 & LOW_6BITS) << 12 ) | ((c3 & LOW_6BITS) << 6 ) | (c4 & LOW_6BITS);
446460 if (codePoint >= SURROGATE_PAIR_START)
447461 {
@@ -464,20 +478,27 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
464478 break ;
465479 case 0xE0 : // 3 byte character, 0x800 to 0xFFFF
466480 {
467- const char c2{ srcData[++index] };
468- const char c3{ srcData[++index] };
481+ const UtilCharInternal_t c2{ srcData[++index] };
482+ const UtilCharInternal_t c3{ srcData[++index] };
469483 destData[destIndex++] = static_cast <utf16string::value_type>(((src & LOW_4BITS) << 12 ) | ((c2 & LOW_6BITS) << 6 ) | (c3 & LOW_6BITS));
470484 }
471485 break ;
472486 case 0xD0 : // 2 byte character, 0x80 to 0x7FF
473487 case 0xC0 :
474488 {
475- const char c2{ srcData[++index] };
489+ const UtilCharInternal_t c2{ srcData[++index] };
476490 destData[destIndex++] = static_cast <utf16string::value_type>(((src & LOW_5BITS) << 6 ) | (c2 & LOW_6BITS));
477491 }
478492 break ;
479493 default : // single byte character, 0x0 to 0x7F
480- destData[destIndex++] = static_cast <utf16string::value_type>(src);
494+ // try to use a fast inner loop for following single byte characters,
495+ // since they are quite probable
496+ do
497+ {
498+ destData[destIndex++] = static_cast <utf16string::value_type>(srcData[index++]);
499+ } while (index < srcSize && srcData[index] > 0 );
500+ // adjust index since it will be incremented by the for loop
501+ --index;
481502 }
482503 }
483504 return dest;
0 commit comments