|
30 | 30 | #include <boost/date_time/posix_time/posix_time_io.hpp>
|
31 | 31 | #endif
|
32 | 32 |
|
33 |
| -#if defined(__GLIBCXX__) |
34 |
| -#include "boost/locale.hpp" |
35 |
| -#else |
36 |
| -// Not supported on libstdc++ |
| 33 | +// Could use the C++ standard library if not __GLIBCXX__, |
| 34 | +// but for testing purposes we just use the handwritten conversions on all platforms. |
| 35 | +#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS) |
37 | 36 | #include <codecvt>
|
38 | 37 | #endif
|
39 | 38 |
|
@@ -253,24 +252,166 @@ const std::error_category & __cdecl linux_category()
|
253 | 252 |
|
254 | 253 | }
|
255 | 254 |
|
| 255 | +#define LOW_3BITS 0x7 |
| 256 | +#define LOW_4BITS 0xF |
| 257 | +#define LOW_5BITS 0x1F |
| 258 | +#define LOW_6BITS 0x3F |
| 259 | +#define BIT4 0x8 |
| 260 | +#define BIT5 0x10 |
| 261 | +#define BIT6 0x20 |
| 262 | +#define BIT7 0x40 |
| 263 | +#define BIT8 0x80 |
| 264 | +#define L_SURROGATE_START 0xDC00 |
| 265 | +#define L_SURROGATE_END 0xDFFF |
| 266 | +#define H_SURROGATE_START 0xD800 |
| 267 | +#define H_SURROGATE_END 0xDBFF |
| 268 | +#define SURROGATE_PAIR_START 0x10000 |
| 269 | + |
256 | 270 | utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
|
257 | 271 | {
|
258 |
| -#if defined(__GLIBCXX__) |
259 |
| - return boost::locale::conv::utf_to_utf<utf16char>(s, boost::locale::conv::stop); |
260 |
| -#else |
| 272 | +#if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS) |
261 | 273 | std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion;
|
262 |
| - return conversion.from_bytes(s); |
| 274 | + return conversion.from_bytes(src); |
| 275 | +#else |
| 276 | + utf16string dest; |
| 277 | + // Avoid repeated heap allocations: reserve less than the source string size, assuming some |
| 278 | + // of the characters are not plain ASCII and collapse into fewer UTF-16 code units. |
| 279 | + dest.reserve(static_cast<size_t>(static_cast<double>(s.size()) * .70)); |
| 280 | + |
| 281 | + for (auto src = s.begin(); src != s.end(); ++src) |
| 282 | + { |
| 283 | + if ((*src & BIT8) == 0) // single byte character, 0x0 to 0x7F |
| 284 | + { |
| 285 | + dest.push_back(utf16string::value_type(*src)); |
| 286 | + } |
| 287 | + else |
| 288 | + { |
| 289 | + if ((*src & BIT8) != 0 && (*src & BIT7) == 0) |
| 290 | + { |
| 291 | + throw std::range_error("UTF-8 string character can never start with 10xxxxxx"); |
| 292 | + } |
| 293 | + |
| 294 | + unsigned char numContBytes = 0; |
| 295 | + uint32_t codePoint; |
| 296 | + if ((*src & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF |
| 297 | + { |
| 298 | + codePoint = *src & LOW_5BITS; |
| 299 | + numContBytes = 1; |
| 300 | + } |
| 301 | + else if ((*src & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF |
| 302 | + { |
| 303 | + codePoint = *src & LOW_4BITS; |
| 304 | + numContBytes = 2; |
| 305 | + } |
| 306 | + else if ((*src & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF |
| 307 | + { |
| 308 | + codePoint = *src & LOW_3BITS; |
| 309 | + numContBytes = 3; |
| 310 | + } |
| 311 | + else |
| 312 | + { |
| 313 | + throw std::range_error("UTF-8 string has invalid Unicode code point"); |
| 314 | + } |
| 315 | + |
| 316 | + for (unsigned char i = 0; i < numContBytes; ++i) |
| 317 | + { |
| 318 | + if (++src == s.end()) |
| 319 | + { |
| 320 | + throw std::range_error("UTF-8 string is missing bytes in character"); |
| 321 | + } |
| 322 | + if ((*src & BIT8) == 0 || (*src & BIT7) != 0) |
| 323 | + { |
| 324 | + throw std::range_error("UTF-8 continuation byte is missing leading byte"); |
| 325 | + } |
| 326 | + codePoint <<= 6; |
| 327 | + codePoint |= *src & LOW_6BITS; |
| 328 | + } |
| 329 | + |
| 330 | + if (codePoint >= SURROGATE_PAIR_START) |
| 331 | + { |
| 332 | + // In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs. |
| 333 | + // - 0x10000 is subtracted from the code point |
| 334 | + // - high surrogate is 0xD800 added to the top ten bits |
| 335 | + // - low surrogate is 0xDC00 added to the low ten bits |
| 336 | + codePoint -= SURROGATE_PAIR_START; |
| 337 | + dest.push_back(utf16string::value_type((codePoint >> 10) | H_SURROGATE_START)); |
| 338 | + dest.push_back(utf16string::value_type((codePoint & 0x3FF) | L_SURROGATE_START)); |
| 339 | + } |
| 340 | + else |
| 341 | + { |
| 342 | + // In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point value. |
| 343 | + // U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present but will encode |
| 344 | + // them if encountered. |
| 345 | + dest.push_back(utf16string::value_type(codePoint)); |
| 346 | + } |
| 347 | + } |
| 348 | + } |
| 349 | + return dest; |
263 | 350 | #endif
|
264 | 351 | }
|
265 | 352 |
|
266 | 353 | std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
|
267 | 354 | {
|
268 |
| -#if defined(__GLIBCXX__) |
269 |
| - return boost::locale::conv::utf_to_utf<char>(w, boost::locale::conv::stop); |
270 |
| -#else |
271 |
| - std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion; |
272 |
| - return conversion.to_bytes(w); |
273 |
| -#endif |
| 355 | + #if defined(CPPREST_STDLIB_UNICODE_CONVERSIONS) |
| 356 | + std::wstring_convert<std::codecvt_utf8_utf16<utf16char>, utf16char> conversion; |
| 357 | + return conversion.to_bytes(w); |
| 358 | + #else |
| 359 | + std::string dest; |
| 360 | + dest.reserve(w.size()); |
| 361 | + for (auto src = w.begin(); src != w.end(); ++src) |
| 362 | + { |
| 363 | + // Check for high surrogate. |
| 364 | + if (*src >= H_SURROGATE_START && *src <= H_SURROGATE_END) |
| 365 | + { |
| 366 | + const auto highSurrogate = *src++; |
| 367 | + if (src == w.end()) |
| 368 | + { |
| 369 | + throw std::range_error("UTF-16 string is missing low surrogate"); |
| 370 | + } |
| 371 | + const auto lowSurrogate = *src; |
| 372 | + if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END) |
| 373 | + { |
| 374 | + throw std::range_error("UTF-16 string has invalid low surrogate"); |
| 375 | + } |
| 376 | + |
| 377 | + // To get from surrogate pair to Unicode code point: |
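| 378 | + // - subtract 0xD800 from high surrogate, this forms top ten bits |
| 379 | + // - subtract 0xDC00 from low surrogate, this forms low ten bits |
| 380 | + // - add 0x10000 (NOTE(review): must be an arithmetic add; bit 16 can already be set, so |= below is not equivalent) |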
| 381 | + // Leaves a code point in U+10000 to U+10FFFF range. |
| 382 | + uint32_t codePoint = highSurrogate - H_SURROGATE_START; |
| 383 | + codePoint <<= 10; |
| 384 | + codePoint |= lowSurrogate - L_SURROGATE_START; |
| 385 | + codePoint |= SURROGATE_PAIR_START; |
| 386 | + |
| 387 | + // 4 bytes needed (21 bits used) |
| 388 | + dest.push_back(char((codePoint >> 18) | 0xF0)); // leading 3 bits |
| 389 | + dest.push_back(char(((codePoint >> 12) & LOW_6BITS) | BIT8)); // next 6 bits |
| 390 | + dest.push_back(char(((codePoint >> 6) & LOW_6BITS) | BIT8)); // next 6 bits |
| 391 | + dest.push_back(char((codePoint & LOW_6BITS) | BIT8)); // trailing 6 bits |
| 392 | + } |
| 393 | + else |
| 394 | + { |
| 395 | + if (*src <= 0x7F) // single byte character |
| 396 | + { |
| 397 | + dest.push_back(static_cast<char>(*src)); |
| 398 | + } |
| 399 | + else if (*src <= 0x7FF) // 2 bytes needed (11 bits used) |
| 400 | + { |
| 401 | + dest.push_back(char((*src >> 6) | 0xC0)); // leading 5 bits |
| 402 | + dest.push_back(char((*src & LOW_6BITS) | BIT8)); // trailing 6 bits |
| 403 | + } |
| 404 | + else // 3 bytes needed (16 bits used) |
| 405 | + { |
| 406 | + dest.push_back(char((*src >> 12) | 0xE0)); // leading 4 bits |
| 407 | + dest.push_back(char(((*src >> 6) & LOW_6BITS) | BIT8)); // middle 6 bits |
| 408 | + dest.push_back(char((*src & LOW_6BITS) | BIT8)); // trailing 6 bits |
| 409 | + } |
| 410 | + } |
| 411 | + } |
| 412 | + |
| 413 | + return dest; |
| 414 | + #endif |
274 | 415 | }
|
275 | 416 |
|
276 | 417 | utf16string __cdecl conversions::usascii_to_utf16(const std::string &s)
|
|
0 commit comments