Skip to content

Commit b3a7141

Browse files
chris0x44 authored and BillyONeal committed
Improve utf8_to_utf16 speed for common path (#892)
* Improve utf8_to_utf16 speed for common path

  Conversions from UTF-8 to UTF-16 will mostly process single-byte code points (e.g. when parsing JSON bodies). This allows running the single-byte conversion in a tight loop that is only interrupted when multi-byte handling becomes necessary. Measurements for a very long string showed a ~30% speed improvement.

* Use UtilCharInternal_t as the character type to avoid issues with the platform-dependent definition of char
1 parent f4c08f0 commit b3a7141

File tree

2 files changed

+65
-19
lines changed

2 files changed

+65
-19
lines changed

Release/src/utilities/asyncrt_utils.cpp

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -347,19 +347,33 @@ const std::error_category & __cdecl linux_category()
347347
#define H_SURROGATE_END 0xDBFF
348348
#define SURROGATE_PAIR_START 0x10000
349349

350+
// Create a dedicated type for characters to avoid the issue
351+
// of different platforms defaulting char to be either signed
352+
// or unsigned.
353+
using UtilCharInternal_t = signed char;
354+
355+
350356
inline size_t count_utf8_to_utf16(const std::string& s)
351357
{
352358
const size_t sSize = s.size();
353-
const char* const sData = s.data();
359+
auto sData = reinterpret_cast<const UtilCharInternal_t* const>(s.data());
354360
size_t result{ sSize };
361+
355362
for (size_t index = 0; index < sSize;)
356363
{
357-
const char c{ sData[index++] };
358-
if ((c & BIT8) == 0)
364+
if( sData[index] > 0 )
359365
{
360-
continue;
366+
// use fast inner loop to skip single byte code points (which are
367+
// expected to be the most frequent)
368+
while ((++index < sSize) && (sData[index] > 0))
369+
;
370+
371+
if (index >= sSize) break;
361372
}
362373

374+
// start special handling for multi-byte code points
375+
const UtilCharInternal_t c{ sData[index++] };
376+
363377
if ((c & BIT7) == 0)
364378
{
365379
throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
@@ -371,7 +385,7 @@ inline size_t count_utf8_to_utf16(const std::string& s)
371385
throw std::range_error("UTF-8 string is missing bytes in character");
372386
}
373387

374-
const char c2{ sData[index++] };
388+
const UtilCharInternal_t c2{ sData[index++] };
375389
if ((c2 & 0xC0) != BIT8)
376390
{
377391
throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
@@ -387,8 +401,8 @@ inline size_t count_utf8_to_utf16(const std::string& s)
387401
throw std::range_error("UTF-8 string is missing bytes in character");
388402
}
389403

390-
const char c2{ sData[index++] };
391-
const char c3{ sData[index++] };
404+
const UtilCharInternal_t c2{ sData[index++] };
405+
const UtilCharInternal_t c3{ sData[index++] };
392406
if (((c2 | c3) & 0xC0) != BIT8)
393407
{
394408
throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
@@ -403,9 +417,9 @@ inline size_t count_utf8_to_utf16(const std::string& s)
403417
throw std::range_error("UTF-8 string is missing bytes in character");
404418
}
405419

406-
const char c2{ sData[index++] };
407-
const char c3{ sData[index++] };
408-
const char c4{ sData[index++] };
420+
const UtilCharInternal_t c2{ sData[index++] };
421+
const UtilCharInternal_t c3{ sData[index++] };
422+
const UtilCharInternal_t c4{ sData[index++] };
409423
if (((c2 | c3 | c4) & 0xC0) != BIT8)
410424
{
411425
throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
@@ -427,21 +441,21 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
427441
{
428442
// Save repeated heap allocations, use the length of resulting sequence.
429443
const size_t srcSize = s.size();
430-
const std::string::value_type* const srcData = &s[0];
444+
auto srcData = reinterpret_cast<const UtilCharInternal_t* const>(s.data());
431445
utf16string dest(count_utf8_to_utf16(s), L'\0');
432446
utf16string::value_type* const destData = &dest[0];
433447
size_t destIndex = 0;
434448

435449
for (size_t index = 0; index < srcSize; ++index)
436450
{
437-
std::string::value_type src = srcData[index];
451+
UtilCharInternal_t src = srcData[index];
438452
switch (src & 0xF0)
439453
{
440454
case 0xF0: // 4 byte character, 0x10000 to 0x10FFFF
441455
{
442-
const char c2{ srcData[++index] };
443-
const char c3{ srcData[++index] };
444-
const char c4{ srcData[++index] };
456+
const UtilCharInternal_t c2{ srcData[++index] };
457+
const UtilCharInternal_t c3{ srcData[++index] };
458+
const UtilCharInternal_t c4{ srcData[++index] };
445459
uint32_t codePoint = ((src & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS);
446460
if (codePoint >= SURROGATE_PAIR_START)
447461
{
@@ -464,20 +478,27 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
464478
break;
465479
case 0xE0: // 3 byte character, 0x800 to 0xFFFF
466480
{
467-
const char c2{ srcData[++index] };
468-
const char c3{ srcData[++index] };
481+
const UtilCharInternal_t c2{ srcData[++index] };
482+
const UtilCharInternal_t c3{ srcData[++index] };
469483
destData[destIndex++] = static_cast<utf16string::value_type>(((src & LOW_4BITS) << 12) | ((c2 & LOW_6BITS) << 6) | (c3 & LOW_6BITS));
470484
}
471485
break;
472486
case 0xD0: // 2 byte character, 0x80 to 0x7FF
473487
case 0xC0:
474488
{
475-
const char c2{ srcData[++index] };
489+
const UtilCharInternal_t c2{ srcData[++index] };
476490
destData[destIndex++] = static_cast<utf16string::value_type>(((src & LOW_5BITS) << 6) | (c2 & LOW_6BITS));
477491
}
478492
break;
479493
default: // single byte character, 0x0 to 0x7F
480-
destData[destIndex++] = static_cast<utf16string::value_type>(src);
494+
// try to use a fast inner loop for following single byte characters,
495+
// since they are quite probable
496+
do
497+
{
498+
destData[destIndex++] = static_cast<utf16string::value_type>(srcData[index++]);
499+
} while (index < srcSize && srcData[index] > 0);
500+
// adjust index since it will be incremented by the for loop
501+
--index;
481502
}
482503
}
483504
return dest;

Release/tests/functional/utils/strings.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,31 @@ TEST(utf8_to_utf16)
205205
#else
206206
VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
207207
#endif
208+
209+
210+
// 1 byte character followed by 4 byte character
211+
input.clear();
212+
input.push_back( 51u); // 00110011
213+
// U+10000
214+
input.push_back(244u); // 11110100
215+
input.push_back(128u); // 10000000
216+
input.push_back(128u); // 10000000
217+
input.push_back(128u); // 10000000
218+
// U+10FFFF
219+
input.push_back(244u); // 11110100
220+
input.push_back(143u); // 10001111
221+
input.push_back(191u); // 10111111
222+
input.push_back(191u); // 10111111
223+
result = utility::conversions::utf8_to_utf16(input);
224+
#if defined(__GLIBCXX__)
225+
VERIFY_ARE_EQUAL(51, result[0]);
226+
VERIFY_ARE_EQUAL(56256, result[1]);
227+
VERIFY_ARE_EQUAL(56320, result[2]);
228+
VERIFY_ARE_EQUAL(56319, result[3]);
229+
VERIFY_ARE_EQUAL(57343, result[4]);
230+
#else
231+
VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
232+
#endif
208233
}
209234

210235
TEST(utf16_to_utf8_errors)

0 commit comments

Comments
 (0)