Skip to content

Commit da4307b

Browse files
committed
Finished implementing UTF16 to UTF8 conversion, removed Boost.Locale and libiconv dependency on all platforms.
1 parent 36d7286 commit da4307b

File tree

5 files changed

+138
-43
lines changed

5 files changed

+138
-43
lines changed

Release/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ elseif(ANDROID)
3636
set(BOOST_ROOT "${CMAKE_BINARY_DIR}/../Boost-for-Android-x86/build")
3737
set(BOOST_LIBRARYDIR "${CMAKE_BINARY_DIR}/../Boost-for-Android-x86/build/lib")
3838
endif()
39-
find_host_package(Boost 1.55 EXACT REQUIRED COMPONENTS random system thread locale filesystem chrono atomic)
39+
find_host_package(Boost 1.55 EXACT REQUIRED COMPONENTS random system thread filesystem chrono atomic)
4040

4141
set(OPENSSL_FOUND 1)
4242
if(ARM)
@@ -73,7 +73,7 @@ elseif(ANDROID)
7373
set(BUILD_SAMPLES OFF)
7474
option(BUILD_TESTS "Build tests." ON)
7575
elseif(UNIX) # This includes OSX
76-
find_package(Boost REQUIRED COMPONENTS random chrono system thread locale regex filesystem)
76+
find_package(Boost REQUIRED COMPONENTS random chrono system thread regex filesystem)
7777
find_package(Threads REQUIRED)
7878
if(APPLE AND NOT OPENSSL_ROOT_DIR)
7979
# Prefer a homebrew version of OpenSSL over the one in /usr/lib

Release/src/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ target_link_libraries(${Casablanca_LIBRARY}
8282
${Boost_THREAD_LIBRARY}
8383
${Boost_ATOMIC_LIBRARY}
8484
${Boost_CHRONO_LIBRARY}
85-
${Boost_LOCALE_LIBRARY}
8685
${Boost_RANDOM_LIBRARY}
8786
${EXTRALINKS}
8887
${Boost_FRAMEWORK}

Release/src/build/vs14.android/packages.config

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@
55
<package id="boost_chrono-android" version="1.55.0.0" targetFramework="Native" />
66
<package id="boost_date_time-android" version="1.55.0.0" targetFramework="Native" />
77
<package id="boost_filesystem-android" version="1.55.0.0" targetFramework="Native" />
8-
<package id="boost_locale-android" version="1.55.0.0" targetFramework="Native" />
98
<package id="boost_system-android" version="1.55.0.0" targetFramework="Native" />
109
<package id="boost_thread-android" version="1.55.0.0" targetFramework="Native" />
11-
<package id="libiconv-android" version="1.13.1.0" targetFramework="Native" />
1210
<package id="openssl-android" version="1.0.1" targetFramework="Native" />
1311
</packages>

Release/src/utilities/asyncrt_utils.cpp

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ const std::error_category & __cdecl linux_category()
252252

253253
}
254254

255-
#define UTF8_ // TODO
255+
#define LOWER_6BITS 0x3F
256256

257257
utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
258258
{
@@ -267,7 +267,6 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
267267

268268
const unsigned char *src = reinterpret_cast<const unsigned char *>(s.c_str());
269269
auto srcRemainingSize = s.size();
270-
const auto leadingBits = 0x3F;
271270
while (srcRemainingSize > 0)
272271
{
273272
if (*src < 0x7F) // single byte character, 0x0 to 0x7F
@@ -298,15 +297,15 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
298297
throw std::invalid_argument("UTF-8 string has invalid Unicode code point");
299298
}
300299
srcRemainingSize -= numContBytes;
301-
if (srcRemainingSize == 0)
300+
if (srcRemainingSize <= 0)
302301
{
303302
throw std::invalid_argument("UTF-8 string is missing bytes in character");
304303
}
305304

306305
for (unsigned char i = 0; i < numContBytes; ++i)
307306
{
308307
codePoint <<= 6;
309-
codePoint |= *++src & leadingBits;
308+
codePoint |= *++src & LOWER_6BITS;
310309
}
311310

312311
if (numContBytes == 3)
@@ -343,34 +342,51 @@ std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
343342
#else
344343
std::string dest;
345344
dest.reserve(w.size()); // TODO size
346-
347345
const utf16string::value_type *src = w.c_str();
348346
auto srcRemainingSize = w.size();
349347
while (srcRemainingSize > 0)
350348
{
351349
if (*src >= 0xD800 && *src <= 0xDBFF)
352350
{
351+
if (--srcRemainingSize == 0)
352+
{
353+
// TODO error
354+
}
353355
// Found a high surrogate.
354-
// TODO in the future check to make sure ....
355-
356356

357+
// To get from surrogate pair to Unicode code point:
358+
// - subtract 0xD800 from high surrogate, this forms top ten bits
359+
// - subtract 0xDC00 from low surrogate, this forms low ten bits
360+
// - add 0x10000
361+
// Leaves a code point in U+10000 to U+10FFFF range.
362+
uint32_t codePoint = *src - 0xD800;
363+
codePoint <<= 10;
364+
codePoint += *++src - 0xDC00;
365+
codePoint += 0x10000;
366+
367+
// 4 bytes needed, using 21 bits
368+
dest.push_back(char(codePoint >> 18) | 0xF0); // leading 3 bits
369+
dest.push_back(((codePoint >> 12) & LOWER_6BITS) | 0x80); // next 6 bits
370+
dest.push_back(((codePoint >> 6) & LOWER_6BITS) | 0x80); // next 6 bits
371+
dest.push_back((codePoint & LOWER_6BITS) | 0x80); // trailing 6 bits
357372
}
358373
else if (*src <= 0xFFFF)
359374
{
360375
if (*src < 0x7F) // single byte character
361376
{
362377
dest.push_back(static_cast<char>(*src));
363378
}
364-
else if (*src <= 0x7FF) // 2 bytes needed
379+
else if (*src <= 0x7FF) // 2 bytes needed (11 bits used)
365380
{
366-
dest.push_back((*src >> 3) | 0xC0);
367-
dest.push_back((*src << 5) | )
381+
dest.push_back(char(*src >> 6) | 0xC0); // leading 5 bits
382+
dest.push_back((*src & LOWER_6BITS) | 0x80); // trailing 6 bits
368383
}
369-
else // 3 bytes needed
384+
else // 3 bytes needed (16 bits used)
370385
{
371-
386+
dest.push_back((*src >> 12) | 0xE0); // leading 4 bits
387+
dest.push_back(((*src >> 6) & LOWER_6BITS) | 0x80); // middle 6 bits
388+
dest.push_back((*src & LOWER_6BITS) | 0x80); // trailing 6 bits
372389
}
373-
374390
}
375391

376392
--srcRemainingSize;

Release/tests/functional/utils/strings.cpp

Lines changed: 107 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,103 @@ TEST(usascii_to_utf16)
4949
}
5050
}
5151

52+
#ifdef _WIN32
53+
#define UTF16(x) L ## x
54+
#else
55+
#define UTF16(x) u ## x
56+
#endif
57+
58+
TEST(utf16_to_utf8)
59+
{
60+
#if !defined(__GLIBCXX__)
61+
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
62+
#endif
63+
64+
// encodes to single byte character
65+
VERIFY_ARE_EQUAL("ABC987", utility::conversions::utf16_to_utf8(UTF16("ABC987")));
66+
67+
// encodes to 2 byte character
68+
utf16string input;
69+
input.push_back(0x80);
70+
input.push_back(0x14D);
71+
input.push_back(0x7FF);
72+
auto result = utility::conversions::utf16_to_utf8(input);
73+
#if defined(__GLIBCXX__)
74+
VERIFY_ARE_EQUAL(-62, result[0]);
75+
VERIFY_ARE_EQUAL(-128, result[1]);
76+
VERIFY_ARE_EQUAL(-59, result[2]);
77+
VERIFY_ARE_EQUAL(-115, result[3]);
78+
VERIFY_ARE_EQUAL(-33, result[4]);
79+
VERIFY_ARE_EQUAL(-65, result[5]);
80+
#else
81+
VERIFY_ARE_EQUAL(conversion.to_bytes(input), result);
82+
#endif
83+
84+
// encodes to 3 byte character
85+
input.clear();
86+
input.push_back(0x800);
87+
input.push_back(0x14AB);
88+
input.push_back(0xFFFF);
89+
result = utility::conversions::utf16_to_utf8(input);
90+
#if defined(__GLIBCXX__)
91+
VERIFY_ARE_EQUAL(-32, result[0]);
92+
VERIFY_ARE_EQUAL(-96, result[1]);
93+
VERIFY_ARE_EQUAL(-128, result[2]);
94+
VERIFY_ARE_EQUAL(-31, result[3]);
95+
VERIFY_ARE_EQUAL(-110, result[4]);
96+
VERIFY_ARE_EQUAL(-85, result[5]);
97+
VERIFY_ARE_EQUAL(-17, result[6]);
98+
VERIFY_ARE_EQUAL(-65, result[7]);
99+
VERIFY_ARE_EQUAL(-65, result[8]);
100+
#else
101+
VERIFY_ARE_EQUAL(conversion.to_bytes(input), result);
102+
#endif
103+
104+
// surrogate pair - encodes to 4 byte character
105+
input.clear();
106+
// U+10000
107+
input.push_back(0xD800);
108+
input.push_back(0xDC00);
109+
// U+10929 (0xD802, 0xDD29)
110+
input.push_back(0xD802);
111+
input.push_back(0xDD29);
112+
// U+9FFFF (0xDA3F, 0xDFFF)
113+
input.push_back(0xDA3F);
114+
input.push_back(0xDFFF);
115+
result = utility::conversions::utf16_to_utf8(input);
116+
#if defined(__GLIBCXX__)
117+
VERIFY_ARE_EQUAL(-16, result[0]);
118+
VERIFY_ARE_EQUAL(-112, result[1]);
119+
VERIFY_ARE_EQUAL(-128, result[2]);
120+
VERIFY_ARE_EQUAL(-128, result[3]);
121+
VERIFY_ARE_EQUAL(-16, result[4]);
122+
VERIFY_ARE_EQUAL(-112, result[5]);
123+
VERIFY_ARE_EQUAL(-92, result[6]);
124+
VERIFY_ARE_EQUAL(-87, result[7]);
125+
VERIFY_ARE_EQUAL(-14, result[8]);
126+
VERIFY_ARE_EQUAL(-97, result[9]);
127+
VERIFY_ARE_EQUAL(-65, result[10]);
128+
VERIFY_ARE_EQUAL(-65, result[11]);
129+
#else
130+
VERIFY_ARE_EQUAL(conversion.to_bytes(input), result);
131+
#endif
132+
}
133+
52134
TEST(utf8_to_utf16)
53135
{
54136
#if !defined(__GLIBCXX__)
55137
std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
56138
#endif
57139

58140
// single byte character
59-
VERIFY_ARE_EQUAL(_XPLATSTR("ABC123"), utility::conversions::utf8_to_utf16("ABC123"));
141+
VERIFY_ARE_EQUAL(UTF16("ABC123"), utility::conversions::utf8_to_utf16("ABC123"));
60142

61143
// 2 byte character
62144
std::string input;
63-
input.push_back(unsigned char(207)); // 11001111
64-
input.push_back(unsigned char(129)); // 10000001
65-
input.push_back(unsigned char(198)); // 11000110
66-
input.push_back(unsigned char(141)); // 10001101
145+
input.push_back(207u); // 11001111
146+
input.push_back(129u); // 10000001
147+
input.push_back(198u); // 11000110
148+
input.push_back(141u); // 10001101
67149
auto result = utility::conversions::utf8_to_utf16(input);
68150
#if defined(__GLIBCXX__)
69151
VERIFY_ARE_EQUAL(961, result[0]);
@@ -74,12 +156,12 @@ TEST(utf8_to_utf16)
74156

75157
// 3 byte character
76158
input.clear();
77-
input.push_back(unsigned char(230)); // 11100110
78-
input.push_back(unsigned char(141)); // 10001101
79-
input.push_back(unsigned char(157)); // 10011101
80-
input.push_back(unsigned char(231)); // 11100111
81-
input.push_back(unsigned char(143)); // 10001111
82-
input.push_back(unsigned char(156)); // 10011100
159+
input.push_back(230u); // 11100110
160+
input.push_back(141u); // 10001101
161+
input.push_back(157u); // 10011101
162+
input.push_back(231u); // 11100111
163+
input.push_back(143u); // 10001111
164+
input.push_back(156u); // 10011100
83165
result = utility::conversions::utf8_to_utf16(input);
84166
#if defined(__GLIBCXX__)
85167
VERIFY_ARE_EQUAL(25437, result[0]);
@@ -90,14 +172,14 @@ TEST(utf8_to_utf16)
90172

91173
// 4 byte character
92174
input.clear();
93-
input.push_back(unsigned char(240)); // 11110000
94-
input.push_back(unsigned char(173)); // 10101101
95-
input.push_back(unsigned char(157)); // 10011101
96-
input.push_back(unsigned char(143)); // 10001111
97-
input.push_back(unsigned char(240)); // 11111000
98-
input.push_back(unsigned char(161)); // 10100001
99-
input.push_back(unsigned char(191)); // 10111111
100-
input.push_back(unsigned char(191)); // 10111111
175+
input.push_back(240u); // 11110000
176+
input.push_back(173u); // 10101101
177+
input.push_back(157u); // 10011101
178+
input.push_back(143u); // 10001111
179+
input.push_back(240u); // 11110000
180+
input.push_back(161u); // 10100001
181+
input.push_back(191u); // 10111111
182+
input.push_back(191u); // 10111111
101183
result = utility::conversions::utf8_to_utf16(input);
102184
#if defined(__GLIBCXX__)
103185
VERIFY_ARE_EQUAL(55413, result[0]);
@@ -113,20 +195,20 @@ TEST(utf8_to_utf16_errors)
113195
{
114196
// missing second continuation byte
115197
std::string input;
116-
input.push_back(unsigned char(207)); // 11001111
198+
input.push_back(207u); // 11001111
117199
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::invalid_argument);
118200

119201
// missing third continuation byte
120202
input.clear();
121-
input.push_back(unsigned char(230)); // 11100110
122-
input.push_back(unsigned char(141)); // 10001101
203+
input.push_back(230u); // 11100110
204+
input.push_back(141u); // 10001101
123205
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::invalid_argument);
124206

125207
// missing fourth continuation byte
126208
input.clear();
127-
input.push_back(unsigned char(240)); // 11110000
128-
input.push_back(unsigned char(173)); // 10101101
129-
input.push_back(unsigned char(157)); // 10011101
209+
input.push_back(240u); // 11110000
210+
input.push_back(173u); // 10101101
211+
input.push_back(157u); // 10011101
130212
VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::invalid_argument);
131213
}
132214

0 commit comments

Comments
 (0)