|
33 | 33 | #define PY_UNICODE_OBJECT_READY(op) (PY_ASCII_OBJECT_CAST(op)->state.ready) |
34 | 34 | #endif |
35 | 35 |
|
| 36 | +/** |
| 37 | + * @brief check if UTF-16 encoded `chars` contain a surrogate pair |
| 38 | + */ |
| 39 | +static bool containsSurrogatePair(const char16_t *chars, size_t length) { |
| 40 | + for (size_t i = 0; i < length; i++) { |
| 41 | + if (Py_UNICODE_IS_SURROGATE(chars[i])) { |
| 42 | + return true; |
| 43 | + } |
| 44 | + } |
| 45 | + return false; |
| 46 | +} |
| 47 | + |
36 | 48 | StrType::StrType(PyObject *object) : PyType(object) {} |
37 | 49 |
|
38 | 50 | StrType::StrType(char *string) : PyType(Py_BuildValue("s", string)) {} |
@@ -91,50 +103,52 @@ StrType::StrType(JSContext *cx, JSString *str) { |
91 | 103 | } |
92 | 104 | PY_UNICODE_OBJECT_READY(pyObject) = 1; |
93 | 105 | #endif |
| 106 | + |
| 107 | + if (containsSurrogatePair(chars, length)) { |
| 108 | + // We must convert to UCS4 here because Python does not support decoding string containing surrogate pairs to bytes |
| 109 | + PyObject *ucs4Obj = asUCS4(pyObject); // convert to a new PyUnicodeObject with UCS4 data |
| 110 | + if (!ucs4Obj) { |
| 111 | + // conversion fails, keep the original `pyObject` |
| 112 | + return; |
| 113 | + } |
| 114 | + Py_DECREF(pyObject); // cleanup the old `pyObject` |
| 115 | + Py_INCREF(ucs4Obj); // XXX: Same as the above `Py_INCREF(pyObject);`. Why double freed on GC? |
| 116 | + pyObject = ucs4Obj; |
| 117 | + } |
94 | 118 | } |
95 | 119 | } |
96 | 120 |
|
97 | 121 | const char *StrType::getValue() const { |
98 | 122 | return PyUnicode_AsUTF8(pyObject); |
99 | 123 | } |
100 | 124 |
|
101 | | -PyObject *StrType::asUCS4() { |
| 125 | +/* static */ |
| 126 | +PyObject *StrType::asUCS4(PyObject *pyObject) { |
| 127 | + if (PyUnicode_KIND(pyObject) != PyUnicode_2BYTE_KIND) { |
| 128 | + // return a new reference to match the behaviour of `PyUnicode_FromKindAndData` |
| 129 | + Py_INCREF(pyObject); |
| 130 | + return pyObject; |
| 131 | + } |
| 132 | + |
102 | 133 | uint16_t *chars = PY_UNICODE_OBJECT_DATA_UCS2(pyObject); |
103 | 134 | size_t length = PY_UNICODE_OBJECT_LENGTH(pyObject); |
104 | 135 |
|
105 | 136 | uint32_t ucs4String[length]; |
106 | 137 | size_t ucs4Length = 0; |
107 | 138 |
|
108 | | - for (size_t i = 0; i < length; i++) { |
109 | | - if (chars[i] >= LOW_SURROGATE_START && chars[i] <= LOW_SURROGATE_END) // character is an unpaired low surrogate |
110 | | - { |
111 | | - char hexString[5]; |
112 | | - sprintf(hexString, "%x", (unsigned int)chars[i]); |
113 | | - std::string errorString = std::string("string contains an unpaired low surrogate at position: ") + std::to_string(i) + std::string(" with a value of 0x") + hexString; |
114 | | - PyErr_SetString(PyExc_UnicodeTranslateError, errorString.c_str()); |
| 139 | + for (size_t i = 0; i < length; i++, ucs4Length++) { |
| 140 | + if (Py_UNICODE_IS_LOW_SURROGATE(chars[i])) { // character is an unpaired low surrogate |
115 | 141 | return NULL; |
116 | | - } |
117 | | - else if (chars[i] >= HIGH_SURROGATE_START && chars[i] <= HIGH_SURROGATE_END) { // character is a high surrogate |
118 | | - if ((i + 1 < length) && chars[i+1] >= LOW_SURROGATE_START && chars[i+1] <= LOW_SURROGATE_END) { // next character is a low surrogate |
119 | | - // see https://www.unicode.org/faq/utf_bom.html#utf16-3 for details |
120 | | - uint32_t X = (chars[i] & ((1 << 6) -1)) << 10 | chars[i+1] & ((1 << 10) -1); |
121 | | - uint32_t W = (chars[i] >> 6) & ((1 << 5) - 1); |
122 | | - uint32_t U = W+1; |
123 | | - ucs4String[ucs4Length] = U << 16 | X; |
124 | | - ucs4Length++; |
| 142 | + } else if (Py_UNICODE_IS_HIGH_SURROGATE(chars[i])) { // character is a high surrogate |
| 143 | + if ((i + 1 < length) && Py_UNICODE_IS_LOW_SURROGATE(chars[i+1])) { // next character is a low surrogate |
| 144 | + ucs4String[ucs4Length] = Py_UNICODE_JOIN_SURROGATES(chars[i], chars[i+1]); |
125 | 145 | i++; // skip over low surrogate |
126 | 146 | } |
127 | 147 | else { // next character is not a low surrogate |
128 | | - char hexString[5]; |
129 | | - sprintf(hexString, "%x", (unsigned int)chars[i]); |
130 | | - std::string errorString = std::string("string contains an unpaired high surrogate at position: ") + std::to_string(i) + std::string(" with a value of 0x") + hexString; |
131 | | - PyErr_SetString(PyExc_UnicodeTranslateError, errorString.c_str()); |
132 | 148 | return NULL; |
133 | 149 | } |
134 | | - } |
135 | | - else { // character is not a surrogate, and is in the BMP |
| 150 | + } else { // character is not a surrogate, and is in the BMP |
136 | 151 | ucs4String[ucs4Length] = chars[i]; |
137 | | - ucs4Length++; |
138 | 152 | } |
139 | 153 | } |
140 | 154 |
|
|
0 commit comments