Skip to content

Commit ecae840

Browse files
committed
Added manual UTF-8 to UTF-16 conversion implementation.
1 parent aec3766 commit ecae840

File tree

2 files changed

+168
-20
lines changed

2 files changed

+168
-20
lines changed

Release/src/utilities/asyncrt_utils.cpp

Lines changed: 85 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232

3333
// Could use C++ standard library if not __GLIBCXX__,
3434
// For testing purposes we just use the handwritten implementation on all platforms.
35-
#if defined(CPPREST_STDLIB_UTF_CONVERSIONS)
35+
#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
3636
#include <codecvt>
3737
#endif
3838

@@ -252,25 +252,100 @@ const std::error_category & __cdecl linux_category()
252252

253253
}
254254

255-
utf16string __cdecl conversions::utf8_to_utf16(const std::string &src)
255+
utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
256256
{
257-
#if defined(CPPREST_STDLIB_UTF_CONVERSIONS)
257+
#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
258258
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
259259
return conversion.from_bytes(src);
260260
#else
261+
utf16string dest;
262+
// Save repeated heap allocations, use less than source string size assuming some
263+
// of the characters are not just ASCII and collapse.
264+
dest.reserve(static_cast<size_t>(s.size() * .70));
265+
266+
const unsigned char *src = reinterpret_cast<const unsigned char *>(s.c_str());
267+
auto srcRemainingSize = s.size();
268+
const auto leadingBits = 0x3F;
269+
while (srcRemainingSize > 0)
270+
{
271+
if (*src < 0x80) // single byte character, 0x0 to 0x7F
272+
{
273+
dest.push_back(utf16string::value_type(*src));
274+
}
275+
else
276+
{
277+
unsigned char numContBytes = 0;
278+
int32_t codePoint;
279+
if (*src < 0xE0) // 2 byte character, 0x80 to 0x7FF
280+
{
281+
codePoint = *src & 0x1F;
282+
numContBytes = 1;
283+
}
284+
else if (*src < 0xF0) // 3 byte character, 0x800 to 0xFFFF
285+
{
286+
codePoint = *src & 0xF;
287+
numContBytes = 2;
288+
}
289+
else if (*src < 0xF8) // 4 byte character, 0x10000 to 0x10FFFF
290+
{
291+
codePoint = *src & 0x7;
292+
numContBytes = 3;
293+
}
294+
else
295+
{
296+
throw std::invalid_argument("UTF-8 string has invalid Unicode code point");
297+
}
298+
srcRemainingSize -= numContBytes;
299+
if (srcRemainingSize == 0)
300+
{
301+
throw std::invalid_argument("UTF-8 string is missing bytes in character");
302+
}
303+
304+
for (unsigned char i = 0; i < numContBytes; ++i)
305+
{
306+
codePoint <<= 6;
307+
codePoint |= *++src & leadingBits;
308+
}
309+
310+
if (numContBytes == 3)
311+
{
312+
// In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
313+
// - 0x10000 is subtracted from the code point
314+
// - high surrogate is 0xD800 added to the top ten bits
315+
// - low surrogate is 0xDC00 added to the low ten bits
316+
codePoint -= 0x10000;
317+
dest.push_back(utf16string::value_type((codePoint >> 10) + 0xD800));
318+
dest.push_back(utf16string::value_type((codePoint & 0x3FF) + 0xDC00));
319+
}
320+
else
321+
{
322+
// In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point value.
323+
// U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present but will encode
324+
// them if encountered.
325+
dest.push_back(utf16string::value_type(codePoint));
326+
}
327+
}
261328

262-
// TODO
329+
--srcRemainingSize;
330+
++src;
331+
}
332+
return dest;
263333
#endif
264334
}
265335

266336
std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
267337
{
268-
#if defined(CPPREST_STDLIB_UTF_CONVERSIONS)
269-
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
270-
return conversion.to_bytes(w);
271-
#else
272-
// TODO
273-
#endif
338+
#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS)
339+
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
340+
return conversion.to_bytes(w);
341+
#else
342+
std::string dest;
343+
dest.reserve(w.size());
344+
345+
346+
347+
return dest;
348+
#endif
274349
}
275350

276351
utf16string __cdecl conversions::usascii_to_utf16(const std::string &s)

Release/tests/functional/utils/strings.cpp

Lines changed: 83 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@
2525

2626
#include "stdafx.h"
2727

28+
#if !defined(__GLIBCXX__)
29+
#include <codecvt>
30+
#endif
31+
2832
#include <locale_guard.h>
2933

3034
using namespace utility;
@@ -44,17 +48,86 @@ TEST(usascii_to_utf16)
4448
VERIFY_ARE_EQUAL((utf16char)str_ascii[i], str_utf16[i]);
4549
}
4650
}
47-
48-
TEST(default_to_utf16)
51+
52+
TEST(utf8_to_utf16)
4953
{
50-
// TODO: find some string that actually uses something unique to the default code page.
51-
std::string str_default("This is a test");
52-
utf16string str_utf16 = utility::conversions::usascii_to_utf16(str_default);
53-
54-
for (size_t i = 0; i < str_default.size(); ++i)
55-
{
56-
VERIFY_ARE_EQUAL((utf16char)str_default[i], str_utf16[i]);
57-
}
54+
#if !defined(__GLIBCXX__)
55+
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
56+
#endif
57+
58+
// single byte character
59+
VERIFY_ARE_EQUAL(_XPLATSTR("ABC123"), utility::conversions::utf8_to_utf16("ABC123"));
60+
61+
// 2 byte character
62+
std::string input;
63+
input.push_back(unsigned char(207)); // 11001111
64+
input.push_back(unsigned char(129)); // 10000001
65+
input.push_back(unsigned char(198)); // 11000110
66+
input.push_back(unsigned char(141)); // 10001101
67+
auto result = utility::conversions::utf8_to_utf16(input);
68+
#if defined(__GLIBCXX__)
69+
VERIFY_ARE_EQUAL(961, result[0]);
70+
VERIFY_ARE_EQUAL(397, result[1]);
71+
#else
72+
VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
73+
#endif
74+
75+
// 3 byte character
76+
input.clear();
77+
input.push_back(unsigned char(230)); // 11100110
78+
input.push_back(unsigned char(141)); // 10001101
79+
input.push_back(unsigned char(157)); // 10011101
80+
input.push_back(unsigned char(231)); // 11100111
81+
input.push_back(unsigned char(143)); // 10001111
82+
input.push_back(unsigned char(156)); // 10011100
83+
result = utility::conversions::utf8_to_utf16(input);
84+
#if defined(__GLIBCXX__)
85+
VERIFY_ARE_EQUAL(25437, result[0]);
86+
VERIFY_ARE_EQUAL(29660, result[1]);
87+
#else
88+
VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
89+
#endif
90+
91+
// 4 byte character
92+
input.clear();
93+
input.push_back(unsigned char(240)); // 11110000
94+
input.push_back(unsigned char(173)); // 10101101
95+
input.push_back(unsigned char(157)); // 10011101
96+
input.push_back(unsigned char(143)); // 10001111
97+
input.push_back(unsigned char(240)); // 11110000
98+
input.push_back(unsigned char(161)); // 10100001
99+
input.push_back(unsigned char(191)); // 10111111
100+
input.push_back(unsigned char(191)); // 10111111
101+
result = utility::conversions::utf8_to_utf16(input);
102+
#if defined(__GLIBCXX__)
103+
VERIFY_ARE_EQUAL(55413, result[0]);
104+
VERIFY_ARE_EQUAL(57167, result[1]);
105+
VERIFY_ARE_EQUAL(55296, result[2]);
106+
VERIFY_ARE_EQUAL(57160, result[3]);
107+
#else
108+
VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
109+
#endif
110+
}
111+
112+
TEST(utf8_to_utf16_errors)
113+
{
114+
// 2 byte character missing its second byte (the only continuation byte)
115+
std::string input;
116+
input.push_back(unsigned char(207)); // 11001111
117+
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::invalid_argument);
118+
119+
// 3 byte character missing its third byte (the second continuation byte)
120+
input.clear();
121+
input.push_back(unsigned char(230)); // 11100110
122+
input.push_back(unsigned char(141)); // 10001101
123+
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::invalid_argument);
124+
125+
// 4 byte character missing its fourth byte (the third continuation byte)
126+
input.clear();
127+
input.push_back(unsigned char(240)); // 11110000
128+
input.push_back(unsigned char(173)); // 10101101
129+
input.push_back(unsigned char(157)); // 10011101
130+
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::invalid_argument);
58131
}
59132

60133
TEST(latin1_to_utf16)

0 commit comments

Comments
 (0)