|
4 | 4 | #include <stddef.h> |
5 | 5 | #include <stdint.h> |
6 | 6 | #include <uchar.h> |
| 7 | +#include <optional> |
7 | 8 | #include "utility/span.h" |
8 | 9 | #include <fmt/format.h> |
9 | 10 | #include "../dab_logging.h" |
@@ -64,45 +65,70 @@ static std::string convert_utf16_to_utf8(tcb::span<const uint8_t> utf16_string) |
64 | 65 | // High surrogates U+D800 - U+DB7F |
65 | 66 | // High private use U+DB80 - U+DBFF |
66 | 67 | // Low surrogates U+DC00 - U+DFFF |
67 | | - // https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates |
68 | | - // A pair of high and low surrogates addresses U+010000-U+100000 according to the equation |
69 | | - // C = 0x10000 + (H-0xD800)*0x0400 + (L-0xDC00) |
70 | 68 |
|
71 | 69 | size_t total_utf16_bytes = utf16_string.size(); |
72 | 70 | if (total_utf16_bytes % 2 != 0) total_utf16_bytes--; // round to 16bits |
| 71 | + |
73 | 72 | // represent utf16 2byte codepoints with utf8 continuation bytes |
74 | | - size_t total_utf8_bytes = 0; |
75 | | - for (size_t i = 0; i < total_utf16_bytes; i+=2) { |
76 | | - const uint16_t c = uint16_t(utf16_string[i]) << 8 | uint16_t(utf16_string[i+1]); // big endian |
77 | | - if (c <= 0x007F) total_utf8_bytes += 1; |
78 | | - else if (c <= 0x07FF) total_utf8_bytes += 2; |
79 | | - else if (c >= 0x2FE0 && c <= 0x2FEF) continue; // ignore gap in BMP |
80 | | - else if (c >= 0xD800 && c <= 0xDFFF) continue; // TODO: handle surrogates |
81 | | - else total_utf8_bytes += 3; |
82 | | - } |
83 | | - std::string utf8_string(total_utf8_bytes, '\0'); |
84 | | - size_t j = 0; |
| 73 | + std::string utf8_string; |
| 74 | + utf8_string.reserve(total_utf16_bytes); |
| 75 | + |
| 76 | + std::optional<uint16_t> high_surrogate = std::nullopt; |
85 | 77 | for (size_t i = 0; i < total_utf16_bytes; i+=2) { |
86 | 78 | const uint16_t c = uint16_t(utf16_string[i]) << 8 | uint16_t(utf16_string[i+1]); // big endian |
| 79 | + |
| 80 | + // https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates |
| 81 | + // A pair of high and low surrogates addresses U+010000-U+10FFFF according to the equation |
| 82 | + // C = 0x10000 + (H-0xD800)*0x0400 + (L-0xDC00) |
| 83 | + if (high_surrogate != std::nullopt) { |
| 84 | + if (c >= 0xDC00 && c <= 0xDFFF) { |
| 85 | + const uint32_t H = uint32_t(high_surrogate.value()); |
| 86 | + const uint32_t L = uint32_t(c); |
| 87 | + const uint32_t C = 0x10000 + (H-0xD800)*0x0400 + (L-0xDC00); |
| 88 | + // 1111_0xxx, 10xx_xxxx, 10xx_xxxx, 10xx_xxxx |
| 89 | + utf8_string.push_back(0b1111'0000 | uint8_t((C & 0b0001'1100'0000'0000'0000'0000) >> 18)); |
| 90 | + utf8_string.push_back(0b1000'0000 | uint8_t((C & 0b0000'0011'1111'0000'0000'0000) >> 12)); |
| 91 | + utf8_string.push_back(0b1000'0000 | uint8_t((C & 0b0000'0000'0000'1111'1100'0000) >> 6)); |
| 92 | + utf8_string.push_back(0b1000'0000 | uint8_t((C & 0b0000'0000'0000'0000'0011'1111) >> 0)); |
| 93 | + high_surrogate = std::nullopt; |
| 94 | + continue; |
| 95 | + } else if (c >= 0xD800 && c <= 0xDBFF) { |
| 96 | + LOG_ERROR( |
| 97 | + "high surrogate received twice in a row, first={:02x}, second={:02x}", |
| 98 | + high_surrogate.value(), c |
| 99 | + ); |
| 100 | + // override original first high surrogate assuming the previous one was a fluke |
| 101 | + high_surrogate = c; |
| 102 | + continue; |
| 103 | + } else { |
| 104 | + LOG_ERROR( |
| 105 | + "surrogate pair missing low surrogate, high_surrogate={:02x}, bad_low_surrogate={:02x}", |
| 106 | + high_surrogate.value(), c |
| 107 | + ); |
| 108 | + // isolated surrogates should be ignored and codepoint processed as normal |
| 109 | + high_surrogate = std::nullopt; |
| 110 | + // @fallthrough |
| 111 | + } |
| 112 | + } |
| 113 | + |
87 | 114 | if (c <= 0x007F) { |
88 | 115 | // 0xxx_xxxx |
89 | | - utf8_string[j] = uint8_t(c & 0x007F); |
90 | | - j += 1; |
| 116 | + utf8_string.push_back(uint8_t(c & 0x007F)); |
91 | 117 | } else if (c <= 0x07FF) { |
92 | 118 | // 110x_xxxx, 10xx_xxxx |
93 | | - utf8_string[j] = 0b1100'0000 | uint8_t((c & 0b0000'0111'1100'0000) >> 6); |
94 | | - utf8_string[j+1] = 0b1000'0000 | uint8_t((c & 0b0000'0000'0011'1111) >> 0); |
95 | | - j += 2; |
| 119 | + utf8_string.push_back(0b1100'0000 | uint8_t((c & 0b0000'0111'1100'0000) >> 6)); |
| 120 | + utf8_string.push_back(0b1000'0000 | uint8_t((c & 0b0000'0000'0011'1111) >> 0)); |
96 | 121 | } else if (c >= 0x2FE0 && c <= 0x2FEF) { |
97 | 122 | // ignore gap in BMP |
98 | | - } else if (c >= 0xD800 && c <= 0xDFFF) { |
99 | | - // TODO: handle surrogates |
| 123 | + } else if (c >= 0xD800 && c <= 0xDBFF) { |
| 124 | + high_surrogate = c; |
| 125 | + } else if (c >= 0xDC00 && c <= 0xDFFF) { |
| 126 | + LOG_ERROR("got low surrogate first instead of high surrogate {:02x}", c); |
100 | 127 | } else { |
101 | 128 | // 1110_xxxx, 10xx_xxxx, 10xx_xxxx |
102 | | - utf8_string[j] = 0b1110'0000 | uint8_t((c & 0b1111'0000'0000'0000) >> 12); |
103 | | - utf8_string[j+1] = 0b1000'0000 | uint8_t((c & 0b0000'1111'1100'0000) >> 6); |
104 | | - utf8_string[j+2] = 0b1000'0000 | uint8_t((c & 0b0000'0000'0011'1111) >> 0); |
105 | | - j += 3; |
| 129 | + utf8_string.push_back(0b1110'0000 | uint8_t((c & 0b1111'0000'0000'0000) >> 12)); |
| 130 | + utf8_string.push_back(0b1000'0000 | uint8_t((c & 0b0000'1111'1100'0000) >> 6)); |
| 131 | + utf8_string.push_back(0b1000'0000 | uint8_t((c & 0b0000'0000'0011'1111) >> 0)); |
106 | 132 | } |
107 | 133 | } |
108 | 134 | return utf8_string; |
|
0 commit comments