|
18 | 18 |
|
19 | 19 | #include <cstddef> |
20 | 20 | #include <cstdint> |
| 21 | +#include <limits> |
21 | 22 |
|
22 | 23 | #include "absl/base/config.h" |
23 | 24 |
|
24 | 25 | namespace absl { |
25 | 26 | ABSL_NAMESPACE_BEGIN |
26 | 27 | namespace strings_internal { |
27 | 28 |
|
28 | | -size_t EncodeUTF8Char(char *buffer, char32_t utf8_char) { |
| 29 | +size_t EncodeUTF8Char(char* buffer, char32_t utf8_char) { |
29 | 30 | if (utf8_char <= 0x7F) { |
30 | 31 | *buffer = static_cast<char>(utf8_char); |
31 | 32 | return 1; |
@@ -53,45 +54,93 @@ size_t EncodeUTF8Char(char *buffer, char32_t utf8_char) { |
53 | 54 | } |
54 | 55 | } |
55 | 56 |
|
56 | | -size_t WideToUtf8(wchar_t wc, char *buf, ShiftState &s) { |
57 | | - const auto v = static_cast<uint32_t>(wc); |
58 | | - if (v < 0x80) { |
59 | | - *buf = static_cast<char>(v); |
| 57 | +size_t WideToUtf8(wchar_t wc, char* buf, ShiftState& s) { |
| 58 | + // Encodes `wc` as UTF-8 into `buf` and returns the number of bytes written, |
| 59 | + // or size_t(-1) on error; `s` carries surrogate-pair state between calls. |
| 60 | + // `buf` is viewed as `unsigned char*` so the bit manipulations below do not |
| 61 | + // depend on the signedness of `char`; `unsigned char` may alias any type. |
| 62 | + auto* ubuf = reinterpret_cast<unsigned char*>(buf); |
| 63 | + const uint32_t v = static_cast<uint32_t>(wc); |
| 64 | + constexpr size_t kError = static_cast<size_t>(-1); |
| 65 | + |
| 66 | + if (v <= 0x007F) { |
| 67 | + // 1-byte sequence (U+0000 to U+007F). |
| 68 | + // 0xxxxxxx. |
| 69 | + ubuf[0] = (0b0111'1111 & v); |
| 70 | + s = {}; // Reset surrogate state. |
60 | 71 | return 1; |
61 | | - } else if (v < 0x800) { |
62 | | - *buf++ = static_cast<char>(0xc0 | (v >> 6)); |
63 | | - *buf = static_cast<char>(0x80 | (v & 0x3f)); |
| 72 | + } else if (0x0080 <= v && v <= 0x07FF) { |
| 73 | + // 2-byte sequence (U+0080 to U+07FF). |
| 74 | + // 110xxxxx 10xxxxxx. |
| 75 | + ubuf[0] = 0b1100'0000 | (0b0001'1111 & (v >> 6)); |
| 76 | + ubuf[1] = 0b1000'0000 | (0b0011'1111 & v); |
| 77 | + s = {}; // Reset surrogate state. |
64 | 78 | return 2; |
65 | | - } else if (v < 0xd800 || (v - 0xe000) < 0x2000) { |
66 | | - *buf++ = static_cast<char>(0xe0 | (v >> 12)); |
67 | | - *buf++ = static_cast<char>(0x80 | ((v >> 6) & 0x3f)); |
68 | | - *buf = static_cast<char>(0x80 | (v & 0x3f)); |
| 79 | + } else if ((0x0800 <= v && v <= 0xD7FF) || (0xE000 <= v && v <= 0xFFFF)) { |
| 80 | + // 3-byte sequence (U+0800 to U+D7FF or U+E000 to U+FFFF). |
| 81 | + // Excludes surrogate code points U+D800-U+DFFF. |
| 82 | + // 1110xxxx 10xxxxxx 10xxxxxx. |
| 83 | + ubuf[0] = 0b1110'0000 | (0b0000'1111 & (v >> 12)); |
| 84 | + ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 6)); |
| 85 | + ubuf[2] = 0b1000'0000 | (0b0011'1111 & v); |
| 86 | + s = {}; // Reset surrogate state. |
69 | 87 | return 3; |
70 | | - } else if ((v - 0x10000) < 0x100000) { |
71 | | - *buf++ = static_cast<char>(0xf0 | (v >> 18)); |
72 | | - *buf++ = static_cast<char>(0x80 | ((v >> 12) & 0x3f)); |
73 | | - *buf++ = static_cast<char>(0x80 | ((v >> 6) & 0x3f)); |
74 | | - *buf = static_cast<char>(0x80 | (v & 0x3f)); |
75 | | - return 4; |
76 | | - } else if (v < 0xdc00) { |
77 | | - s.saw_high_surrogate = true; |
78 | | - s.bits = static_cast<uint8_t>(v & 0x3); |
79 | | - const uint8_t high_bits = ((v >> 6) & 0xf) + 1; |
80 | | - *buf++ = static_cast<char>(0xf0 | (high_bits >> 2)); |
81 | | - *buf = |
82 | | - static_cast<char>(0x80 | static_cast<uint8_t>((high_bits & 0x3) << 4) | |
83 | | - static_cast<uint8_t>((v >> 2) & 0xf)); |
84 | | - return 2; |
85 | | - } else if (v < 0xe000 && s.saw_high_surrogate) { |
86 | | - *buf++ = static_cast<char>(0x80 | static_cast<uint8_t>(s.bits << 4) | |
87 | | - static_cast<uint8_t>((v >> 6) & 0xf)); |
88 | | - *buf = static_cast<char>(0x80 | (v & 0x3f)); |
89 | | - s.saw_high_surrogate = false; |
90 | | - s.bits = 0; |
91 | | - return 2; |
92 | | - } else { |
93 | | - return static_cast<size_t>(-1); |
| 88 | + } else if (0xD800 <= v && v <= 0xDBFF) { |
| 89 | + // High surrogate (U+D800 to U+DBFF): emits the first two bytes of the |
| 90 | + // eventual 4-byte sequence. The +1 below is the 0x10000 surrogate offset. |
| 91 | + const unsigned char high_bits_val = (0b0000'1111 & (v >> 6)) + 1; |
| 92 | + |
| 93 | + // First byte of the 4-byte UTF-8 sequence (11110xxx). |
| 94 | + ubuf[0] = 0b1111'0000 | (0b0000'0111 & (high_bits_val >> 2)); |
| 95 | + // Second byte of the 4-byte UTF-8 sequence (10xxxxxx). |
| 96 | + ubuf[1] = 0b1000'0000 | // |
| 97 | + (0b0011'0000 & (high_bits_val << 4)) | // |
| 98 | + (0b0000'1111 & (v >> 2)); |
| 99 | + // Mark a high surrogate as pending and keep its low two bits for byte three. |
| 100 | + s = {true, static_cast<unsigned char>(0b0000'0011 & v)}; |
| 101 | + return 2; // Wrote 2 bytes, expecting 2 more from a low surrogate. |
| 102 | + } else if (0xDC00 <= v && v <= 0xDFFF) { |
| 103 | + // Low Surrogate (U+DC00 to U+DFFF). |
| 104 | + // This part forms the last two bytes of a 4-byte UTF-8 sequence, |
| 105 | + // using state from a preceding high surrogate. |
| 106 | + if (!s.saw_high_surrogate) { |
| 107 | + // Error: isolated low surrogate without a preceding high surrogate. |
| 108 | + // `s` is not modified on this path and nothing is written to the buffer. |
| 109 | + // The caller should handle the error. |
| 110 | + return kError; |
| 111 | + } |
| 112 | + |
| 113 | + // Third byte of the 4-byte UTF-8 sequence (10xxxxxx). |
| 114 | + ubuf[0] = 0b1000'0000 | // |
| 115 | + (0b0011'0000 & (s.bits << 4)) | // |
| 116 | + (0b0000'1111 & (v >> 6)); |
| 117 | + // Fourth byte of the 4-byte UTF-8 sequence (10xxxxxx). |
| 118 | + ubuf[1] = 0b1000'0000 | (0b0011'1111 & v); |
| 119 | + |
| 120 | + s = {}; // Reset surrogate state, pair complete. |
| 121 | + return 2; // Wrote 2 more bytes, completing the 4-byte sequence. |
| 122 | + } else if constexpr (0xFFFF < std::numeric_limits<wchar_t>::max()) { |
| 123 | + // Direct 4-byte conversion for a `wchar_t` wider than 16 bits. |
| 124 | + // The `if constexpr` condition is evaluated at compile time; the branch is |
| 125 | + // discarded when wchar_t cannot represent values above 0xFFFF. It comes |
| 126 | + // after the surrogate checks so surrogates are handled by their own logic. |
| 127 | + // The inner `if` is the runtime check for the 4-byte range. Here v is known |
| 128 | + // to be outside the 1-, 2-, and 3-byte BMP ranges and not a surrogate. |
| 129 | + if (0x10000 <= v && v <= 0x10FFFF) { |
| 130 | + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. |
| 131 | + ubuf[0] = 0b1111'0000 | (0b0000'0111 & (v >> 18)); |
| 132 | + ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 12)); |
| 133 | + ubuf[2] = 0b1000'0000 | (0b0011'1111 & (v >> 6)); |
| 134 | + ubuf[3] = 0b1000'0000 | (0b0011'1111 & v); |
| 135 | + s = {}; // Reset surrogate state. |
| 136 | + return 4; |
| 137 | + } |
94 | 138 | } |
| 139 | + |
| 140 | + // Invalid wchar_t value: every value up to U+10FFFF returned in a branch |
| 141 | + // above, so only values beyond the Unicode range reach this point. |
| 142 | + s = {}; // Reset surrogate state. |
| 143 | + return kError; |
95 | 144 | } |
96 | 145 |
|
97 | 146 | } // namespace strings_internal |
|
0 commit comments