Skip to content

Commit f624778

Browse files
committed
refactor(string): convert surrogate pairs using Python-provided macros
1 parent 0504ae1 · commit f624778

File tree

1 file changed

+4
-8
lines changed

1 file changed

+4
-8
lines changed

src/StrType.cc

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -131,21 +131,17 @@ PyObject *StrType::asUCS4() {
131 131
size_t ucs4Length = 0;
132 132

133 133
for (size_t i = 0; i < length; i++) {
134-
if (chars[i] >= LOW_SURROGATE_START && chars[i] <= LOW_SURROGATE_END) // character is an unpaired low surrogate
134+
if (Py_UNICODE_IS_LOW_SURROGATE(chars[i])) // character is an unpaired low surrogate
135 135
{
136 136
char hexString[5];
137 137
sprintf(hexString, "%x", (unsigned int)chars[i]);
138 138
std::string errorString = std::string("string contains an unpaired low surrogate at position: ") + std::to_string(i) + std::string(" with a value of 0x") + hexString;
139 139
PyErr_SetString(PyExc_UnicodeTranslateError, errorString.c_str());
140 140
return NULL;
141 141
}
142-
else if (chars[i] >= HIGH_SURROGATE_START && chars[i] <= HIGH_SURROGATE_END) { // character is a high surrogate
143-
if ((i + 1 < length) && chars[i+1] >= LOW_SURROGATE_START && chars[i+1] <= LOW_SURROGATE_END) { // next character is a low surrogate
144-
// see https://www.unicode.org/faq/utf_bom.html#utf16-3 for details
145-
uint32_t X = (chars[i] & ((1 << 6) -1)) << 10 | chars[i+1] & ((1 << 10) -1);
146-
uint32_t W = (chars[i] >> 6) & ((1 << 5) - 1);
147-
uint32_t U = W+1;
148-
ucs4String[ucs4Length] = U << 16 | X;
142+
else if (Py_UNICODE_IS_HIGH_SURROGATE(chars[i])) { // character is a high surrogate
143+
if ((i + 1 < length) && Py_UNICODE_IS_LOW_SURROGATE(chars[i+1])) { // next character is a low surrogate
144+
ucs4String[ucs4Length] = Py_UNICODE_JOIN_SURROGATES(chars[i], chars[i+1]);
149 145
ucs4Length++;
150 146
i++; // skip over low surrogate
151 147
}

0 commit comments

Comments
 (0)