Skip to content

Commit 83e249f

Browse files
OmerMor authored and copybara-github committed
Rewrite WideToUtf8 for improved readability.
This is supposed to be a zero-diff change.

PiperOrigin-RevId: 756859112
Change-Id: Ia81a84bc5d1e6f2a1299ca0ff5dbcec48583ab76
1 parent 3eb2041 commit 83e249f

File tree

3 files changed

+103
-41
lines changed

3 files changed

+103
-41
lines changed

absl/strings/internal/utf8.cc

Lines changed: 85 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -18,14 +18,15 @@
1818

1919
#include <cstddef>
2020
#include <cstdint>
21+
#include <limits>
2122

2223
#include "absl/base/config.h"
2324

2425
namespace absl {
2526
ABSL_NAMESPACE_BEGIN
2627
namespace strings_internal {
2728

28-
size_t EncodeUTF8Char(char *buffer, char32_t utf8_char) {
29+
size_t EncodeUTF8Char(char* buffer, char32_t utf8_char) {
2930
if (utf8_char <= 0x7F) {
3031
*buffer = static_cast<char>(utf8_char);
3132
return 1;
@@ -53,45 +54,93 @@ size_t EncodeUTF8Char(char *buffer, char32_t utf8_char) {
5354
}
5455
}
5556

56-
size_t WideToUtf8(wchar_t wc, char *buf, ShiftState &s) {
57-
const auto v = static_cast<uint32_t>(wc);
58-
if (v < 0x80) {
59-
*buf = static_cast<char>(v);
57+
size_t WideToUtf8(wchar_t wc, char* buf, ShiftState& s) {
58+
// Reinterpret the output buffer `buf` as `unsigned char*` for subsequent
59+
// bitwise operations. This ensures well-defined behavior for bit
60+
// manipulations (avoiding issues with signed `char`) and is safe under C++
61+
// aliasing rules, as `unsigned char` can alias any type.
62+
auto* ubuf = reinterpret_cast<unsigned char*>(buf);
63+
const uint32_t v = static_cast<uint32_t>(wc);
64+
constexpr size_t kError = static_cast<size_t>(-1);
65+
66+
if (v <= 0x007F) {
67+
// 1-byte sequence (U+0000 to U+007F).
68+
// 0xxxxxxx.
69+
ubuf[0] = (0b0111'1111 & v);
70+
s = {}; // Reset surrogate state.
6071
return 1;
61-
} else if (v < 0x800) {
62-
*buf++ = static_cast<char>(0xc0 | (v >> 6));
63-
*buf = static_cast<char>(0x80 | (v & 0x3f));
72+
} else if (0x0080 <= v && v <= 0x07FF) {
73+
// 2-byte sequence (U+0080 to U+07FF).
74+
// 110xxxxx 10xxxxxx.
75+
ubuf[0] = 0b1100'0000 | (0b0001'1111 & (v >> 6));
76+
ubuf[1] = 0b1000'0000 | (0b0011'1111 & v);
77+
s = {}; // Reset surrogate state.
6478
return 2;
65-
} else if (v < 0xd800 || (v - 0xe000) < 0x2000) {
66-
*buf++ = static_cast<char>(0xe0 | (v >> 12));
67-
*buf++ = static_cast<char>(0x80 | ((v >> 6) & 0x3f));
68-
*buf = static_cast<char>(0x80 | (v & 0x3f));
79+
} else if ((0x0800 <= v && v <= 0xD7FF) || (0xE000 <= v && v <= 0xFFFF)) {
80+
// 3-byte sequence (U+0800 to U+D7FF or U+E000 to U+FFFF).
81+
// Excludes surrogate code points U+D800-U+DFFF.
82+
// 1110xxxx 10xxxxxx 10xxxxxx.
83+
ubuf[0] = 0b1110'0000 | (0b0000'1111 & (v >> 12));
84+
ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 6));
85+
ubuf[2] = 0b1000'0000 | (0b0011'1111 & v);
86+
s = {}; // Reset surrogate state.
6987
return 3;
70-
} else if ((v - 0x10000) < 0x100000) {
71-
*buf++ = static_cast<char>(0xf0 | (v >> 18));
72-
*buf++ = static_cast<char>(0x80 | ((v >> 12) & 0x3f));
73-
*buf++ = static_cast<char>(0x80 | ((v >> 6) & 0x3f));
74-
*buf = static_cast<char>(0x80 | (v & 0x3f));
75-
return 4;
76-
} else if (v < 0xdc00) {
77-
s.saw_high_surrogate = true;
78-
s.bits = static_cast<uint8_t>(v & 0x3);
79-
const uint8_t high_bits = ((v >> 6) & 0xf) + 1;
80-
*buf++ = static_cast<char>(0xf0 | (high_bits >> 2));
81-
*buf =
82-
static_cast<char>(0x80 | static_cast<uint8_t>((high_bits & 0x3) << 4) |
83-
static_cast<uint8_t>((v >> 2) & 0xf));
84-
return 2;
85-
} else if (v < 0xe000 && s.saw_high_surrogate) {
86-
*buf++ = static_cast<char>(0x80 | static_cast<uint8_t>(s.bits << 4) |
87-
static_cast<uint8_t>((v >> 6) & 0xf));
88-
*buf = static_cast<char>(0x80 | (v & 0x3f));
89-
s.saw_high_surrogate = false;
90-
s.bits = 0;
91-
return 2;
92-
} else {
93-
return static_cast<size_t>(-1);
88+
} else if (0xD800 <= v && v <= 0xDBFF) {
89+
// High Surrogate (U+D800 to U+DBFF).
90+
// This part forms the first two bytes of an eventual 4-byte UTF-8 sequence.
91+
const unsigned char high_bits_val = (0b0000'1111 & (v >> 6)) + 1;
92+
93+
// First byte of the 4-byte UTF-8 sequence (11110xxx).
94+
ubuf[0] = 0b1111'0000 | (0b0000'0111 & (high_bits_val >> 2));
95+
// Second byte of the 4-byte UTF-8 sequence (10xxxxxx).
96+
ubuf[1] = 0b1000'0000 | //
97+
(0b0011'0000 & (high_bits_val << 4)) | //
98+
(0b0000'1111 & (v >> 2));
99+
// Set state for high surrogate after writing to buffer.
100+
s = {true, static_cast<unsigned char>(0b0000'0011 & v)};
101+
return 2; // Wrote 2 bytes, expecting 2 more from a low surrogate.
102+
} else if (0xDC00 <= v && v <= 0xDFFF) {
103+
// Low Surrogate (U+DC00 to U+DFFF).
104+
// This part forms the last two bytes of a 4-byte UTF-8 sequence,
105+
// using state from a preceding high surrogate.
106+
if (!s.saw_high_surrogate) {
107+
// Error: Isolated low surrogate without a preceding high surrogate.
108+
// s remains in its current (problematic) state.
109+
// Caller should handle error.
110+
return kError;
111+
}
112+
113+
// Third byte of the 4-byte UTF-8 sequence (10xxxxxx).
114+
ubuf[0] = 0b1000'0000 | //
115+
(0b0011'0000 & (s.bits << 4)) | //
116+
(0b0000'1111 & (v >> 6));
117+
// Fourth byte of the 4-byte UTF-8 sequence (10xxxxxx).
118+
ubuf[1] = 0b1000'0000 | (0b0011'1111 & v);
119+
120+
s = {}; // Reset surrogate state, pair complete.
121+
return 2; // Wrote 2 more bytes, completing the 4-byte sequence.
122+
} else if constexpr (0xFFFF < std::numeric_limits<wchar_t>::max()) {
123+
// Conditionally compile the 4-byte direct conversion branch.
124+
// This block is compiled only if wchar_t can represent values > 0xFFFF.
125+
// It's placed after surrogate checks to ensure surrogates are handled by
126+
// their specific logic. This inner 'if' is the runtime check for the 4-byte
127+
// range. At this point, v is known not to be in the 1, 2, or 3-byte BMP
128+
// ranges, nor is it a surrogate code point.
129+
if (0x10000 <= v && v <= 0x10FFFF) {
130+
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
131+
ubuf[0] = 0b1111'0000 | (0b0000'0111 & (v >> 18));
132+
ubuf[1] = 0b1000'0000 | (0b0011'1111 & (v >> 12));
133+
ubuf[2] = 0b1000'0000 | (0b0011'1111 & (v >> 6));
134+
ubuf[3] = 0b1000'0000 | (0b0011'1111 & v);
135+
s = {}; // Reset surrogate state.
136+
return 4;
137+
}
94138
}
139+
140+
// Invalid wchar_t value (e.g., out of Unicode range, or unhandled after all
141+
// checks).
142+
s = {}; // Reset surrogate state.
143+
return kError;
95144
}
96145

97146
} // namespace strings_internal

absl/strings/internal/utf8.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -41,11 +41,11 @@ namespace strings_internal {
4141
// characters into buffer, however never will more than kMaxEncodedUTF8Size
4242
// bytes be written, regardless of the value of utf8_char.
4343
enum { kMaxEncodedUTF8Size = 4 };
44-
size_t EncodeUTF8Char(char *buffer, char32_t utf8_char);
44+
size_t EncodeUTF8Char(char* buffer, char32_t utf8_char);
4545

4646
struct ShiftState {
4747
bool saw_high_surrogate = false;
48-
uint8_t bits = 0;
48+
unsigned char bits = 0;
4949
};
5050

5151
// Converts `wc` from UTF-16 or UTF-32 to UTF-8 and writes to `buf`. `buf` is
@@ -55,7 +55,7 @@ struct ShiftState {
5555
//
5656
// This is basically std::wcrtomb(), but always outputting UTF-8 instead of
5757
// respecting the current locale.
58-
size_t WideToUtf8(wchar_t wc, char *buf, ShiftState &s);
58+
size_t WideToUtf8(wchar_t wc, char* buf, ShiftState& s);
5959

6060
} // namespace strings_internal
6161
ABSL_NAMESPACE_END

absl/strings/internal/utf8_test.cc

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -103,8 +103,21 @@ std::vector<WideToUtf8TestCase> GetWideToUtf8TestCases() {
103103
{"BMP_MaxBeforeSurrogates_D7FF", L'\uD7FF', "\xED\x9F\xBF", 3},
104104
{"BMP_FFFF", L'\uFFFF', "\xEF\xBF\xBF", 3},
105105

106-
{"IsolatedHighSurr_D800", L'\xD800', "\xF0\x90", 2, {true, 0}, {true, 0}},
107-
{"IsolatedHighSurr_DBFF", L'\xDBFF', "\xF4\x8F", 2, {true, 3}, {true, 3}},
106+
{"IsolatedHighSurr_D800", L'\xD800', "\xF0\x90", 2, {}, {true, 0}},
107+
{"IsolatedHighSurr_DBFF", L'\xDBFF', "\xF4\x8F", 2, {}, {true, 3}},
108+
109+
{"HighSurr_D800_after_HighD800",
110+
L'\xD800',
111+
"\xF0\x90",
112+
2,
113+
{true, 0},
114+
{true, 0}},
115+
{"HighSurr_DBFF_after_HighDBFF",
116+
L'\xDBFF',
117+
"\xF4\x8F",
118+
2,
119+
{true, 3},
120+
{true, 3}},
108121

109122
{"LowSurr_DC00_after_HighD800", L'\xDC00', "\x80\x80", 2, {true, 0}, {}},
110123
{"LowSurr_DFFD_after_HighDBFF", L'\xDFFD', "\xBF\xBD", 2, {true, 3}, {}},

0 commit comments

Comments
 (0)