Skip to content

Commit c552bfe

Browse files
authored
Merge pull request #98 from Distributive-Network/Xmader/fix/non-BMP-string
Fix non-BMP strings
2 parents 48623e2 + f5a3846 commit c552bfe

File tree

7 files changed

+44
-63
lines changed

7 files changed

+44
-63
lines changed

include/StrType.hh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ public:
5050
* @return PyObject* - the UCS4-encoding of the pyObject string
5151
*
5252
*/
53-
PyObject *asUCS4();
53+
static PyObject *asUCS4(PyObject *pyObject);
5454
};
5555

5656
#endif

include/modules/pythonmonkey/pythonmonkey.hh

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,6 @@ void handleSharedPythonMonkeyMemory(JSContext *cx, JSGCStatus status, JS::GCReas
6464
*/
6565
static PyObject *collect(PyObject *self, PyObject *args);
6666

67-
/**
68-
* @brief Function exposed by the python module to convert UTF16 strings to UCS4 strings
69-
*
70-
* @param self - Pointer to the module object
71-
* @param args - Pointer to the python tuple of arguments (expected to contain a UTF16-encoded string as the first element)
72-
* @return PyObject* - A new python string in UCS4 encoding
73-
*/
74-
static PyObject *asUCS4(PyObject *self, PyObject *args);
75-
7667
/**
7768
* @brief Function exposed by the python module for evaluating arbitrary JS code
7869
*

python/pythonmonkey/pythonmonkey.pyi

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,6 @@ def collect() -> None:
1616
Calls the spidermonkey garbage collector
1717
"""
1818

19-
@_typing.overload
20-
def asUCS4(utf16_str: str, /) -> str:
21-
"""
22-
Expects a python string in UTF16 encoding, and returns a new equivalent string in UCS4.
23-
Undefined behaviour if the string is not in UTF16.
24-
"""
25-
@_typing.overload
26-
def asUCS4(anything_else: _typing.Any, /) -> _typing.NoReturn: ...
27-
2819
class bigint(int):
2920
"""
3021
Representing JavaScript BigInt in Python

src/StrType.cc

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,18 @@
3333
#define PY_UNICODE_OBJECT_READY(op) (PY_ASCII_OBJECT_CAST(op)->state.ready)
3434
#endif
3535

36+
/**
37+
* @brief check if UTF-16 encoded `chars` contain a surrogate pair
38+
*/
39+
static bool containsSurrogatePair(const char16_t *chars, size_t length) {
40+
for (size_t i = 0; i < length; i++) {
41+
if (Py_UNICODE_IS_SURROGATE(chars[i])) {
42+
return true;
43+
}
44+
}
45+
return false;
46+
}
47+
3648
StrType::StrType(PyObject *object) : PyType(object) {}
3749

3850
StrType::StrType(char *string) : PyType(Py_BuildValue("s", string)) {}
@@ -91,50 +103,52 @@ StrType::StrType(JSContext *cx, JSString *str) {
91103
}
92104
PY_UNICODE_OBJECT_READY(pyObject) = 1;
93105
#endif
106+
107+
if (containsSurrogatePair(chars, length)) {
108+
// We must convert to UCS4 here because Python does not support decoding string containing surrogate pairs to bytes
109+
PyObject *ucs4Obj = asUCS4(pyObject); // convert to a new PyUnicodeObject with UCS4 data
110+
if (!ucs4Obj) {
111+
// conversion fails, keep the original `pyObject`
112+
return;
113+
}
114+
Py_DECREF(pyObject); // cleanup the old `pyObject`
115+
Py_INCREF(ucs4Obj); // XXX: Same as the above `Py_INCREF(pyObject);`. Why double freed on GC?
116+
pyObject = ucs4Obj;
117+
}
94118
}
95119
}
96120

97121
const char *StrType::getValue() const {
98122
return PyUnicode_AsUTF8(pyObject);
99123
}
100124

101-
PyObject *StrType::asUCS4() {
125+
/* static */
126+
PyObject *StrType::asUCS4(PyObject *pyObject) {
127+
if (PyUnicode_KIND(pyObject) != PyUnicode_2BYTE_KIND) {
128+
// return a new reference to match the behaviour of `PyUnicode_FromKindAndData`
129+
Py_INCREF(pyObject);
130+
return pyObject;
131+
}
132+
102133
uint16_t *chars = PY_UNICODE_OBJECT_DATA_UCS2(pyObject);
103134
size_t length = PY_UNICODE_OBJECT_LENGTH(pyObject);
104135

105136
uint32_t ucs4String[length];
106137
size_t ucs4Length = 0;
107138

108-
for (size_t i = 0; i < length; i++) {
109-
if (chars[i] >= LOW_SURROGATE_START && chars[i] <= LOW_SURROGATE_END) // character is an unpaired low surrogate
110-
{
111-
char hexString[5];
112-
sprintf(hexString, "%x", (unsigned int)chars[i]);
113-
std::string errorString = std::string("string contains an unpaired low surrogate at position: ") + std::to_string(i) + std::string(" with a value of 0x") + hexString;
114-
PyErr_SetString(PyExc_UnicodeTranslateError, errorString.c_str());
139+
for (size_t i = 0; i < length; i++, ucs4Length++) {
140+
if (Py_UNICODE_IS_LOW_SURROGATE(chars[i])) { // character is an unpaired low surrogate
115141
return NULL;
116-
}
117-
else if (chars[i] >= HIGH_SURROGATE_START && chars[i] <= HIGH_SURROGATE_END) { // character is a high surrogate
118-
if ((i + 1 < length) && chars[i+1] >= LOW_SURROGATE_START && chars[i+1] <= LOW_SURROGATE_END) { // next character is a low surrogate
119-
// see https://www.unicode.org/faq/utf_bom.html#utf16-3 for details
120-
uint32_t X = (chars[i] & ((1 << 6) -1)) << 10 | chars[i+1] & ((1 << 10) -1);
121-
uint32_t W = (chars[i] >> 6) & ((1 << 5) - 1);
122-
uint32_t U = W+1;
123-
ucs4String[ucs4Length] = U << 16 | X;
124-
ucs4Length++;
142+
} else if (Py_UNICODE_IS_HIGH_SURROGATE(chars[i])) { // character is a high surrogate
143+
if ((i + 1 < length) && Py_UNICODE_IS_LOW_SURROGATE(chars[i+1])) { // next character is a low surrogate
144+
ucs4String[ucs4Length] = Py_UNICODE_JOIN_SURROGATES(chars[i], chars[i+1]);
125145
i++; // skip over low surrogate
126146
}
127147
else { // next character is not a low surrogate
128-
char hexString[5];
129-
sprintf(hexString, "%x", (unsigned int)chars[i]);
130-
std::string errorString = std::string("string contains an unpaired high surrogate at position: ") + std::to_string(i) + std::string(" with a value of 0x") + hexString;
131-
PyErr_SetString(PyExc_UnicodeTranslateError, errorString.c_str());
132148
return NULL;
133149
}
134-
}
135-
else { // character is not a surrogate, and is in the BMP
150+
} else { // character is not a surrogate, and is in the BMP
136151
ucs4String[ucs4Length] = chars[i];
137-
ucs4Length++;
138152
}
139153
}
140154

src/modules/pythonmonkey/pythonmonkey.cc

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -144,16 +144,6 @@ static PyObject *collect(PyObject *self, PyObject *args) {
144144
Py_RETURN_NONE;
145145
}
146146

147-
static PyObject *asUCS4(PyObject *self, PyObject *args) {
148-
StrType *str = new StrType(PyTuple_GetItem(args, 0));
149-
if (!PyUnicode_Check(str->getPyObject())) {
150-
PyErr_SetString(PyExc_TypeError, "pythonmonkey.asUCS4 expects a string as its first argument");
151-
return NULL;
152-
}
153-
154-
return str->asUCS4();
155-
}
156-
157147
static bool getEvalOption(PyObject *evalOptions, const char *optionName, const char **s_p) {
158148
PyObject *value;
159149

@@ -281,7 +271,6 @@ PyMethodDef PythonMonkeyMethods[] = {
281271
{"eval", eval, METH_VARARGS, "Javascript evaluator in Python"},
282272
{"isCompilableUnit", isCompilableUnit, METH_VARARGS, "Hint if a string might be compilable Javascript"},
283273
{"collect", collect, METH_VARARGS, "Calls the spidermonkey garbage collector"},
284-
{"asUCS4", asUCS4, METH_VARARGS, "Expects a python string in UTF16 encoding, and returns a new equivalent string in UCS4. Undefined behaviour if the string is not in UTF16."},
285274
{NULL, NULL, 0, NULL}
286275
};
287276

tests/python/test_pythonmonkey_eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ def test_eval_functions_ucs4_string_args():
225225
codepoint = random.randint(0x010000, 0x10FFFF)
226226
string2 += chr(codepoint)
227227

228-
assert pm.asUCS4(concatenate(string1, string2)) == (string1 + string2)
228+
assert concatenate(string1, string2) == (string1 + string2)
229229

230230
def test_eval_functions_roundtrip():
231231
# BF-60 https://github.com/Distributive-Network/PythonMonkey/pull/18

tests/python/test_strings.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,7 @@ def test_eval_unpaired_surrogate_string_matches_evaluated_string():
2929

3030
def test_eval_ucs4_string_matches_evaluated_string():
3131
py_ucs4_string = "🀄🀛🜢"
32-
js_utf16_string = pm.eval(repr(py_ucs4_string))
33-
js_ucs4_string = pm.asUCS4(js_utf16_string)
32+
js_ucs4_string = pm.eval(repr(py_ucs4_string))
3433
assert py_ucs4_string == js_ucs4_string
3534

3635
def test_eval_latin1_string_fuzztest():
@@ -111,8 +110,7 @@ def test_eval_ucs4_string_fuzztest():
111110
INITIAL_STRING = string1
112111
m = 10
113112
for _ in range(m):
114-
utf16_string2 = pm.eval("'" + string1 + "'")
115-
string2 = pm.asUCS4(utf16_string2)
113+
string2 = pm.eval("'" + string1 + "'")
116114
assert len(string1) == length
117115
assert len(string2) == length
118116
assert len(string1) == len(string2)
@@ -158,8 +156,7 @@ def test_eval_boxed_unpaired_surrogate_string_matches_evaluated_string():
158156

159157
def test_eval_boxed_ucs4_string_matches_evaluated_string():
160158
py_ucs4_string = "🀄🀛🜢"
161-
js_utf16_string = pm.eval(f'new String({repr(py_ucs4_string)})')
162-
js_ucs4_string = pm.asUCS4(js_utf16_string)
159+
js_ucs4_string = pm.eval(f'new String({repr(py_ucs4_string)})')
163160
assert py_ucs4_string == js_ucs4_string
164161

165162
def test_eval_boxed_latin1_string_fuzztest():
@@ -240,8 +237,7 @@ def test_eval_boxed_ucs4_string_fuzztest():
240237
INITIAL_STRING = string1
241238
m = 10
242239
for _ in range(m):
243-
utf16_string2 = pm.eval(f'new String("{string1}")')
244-
string2 = pm.asUCS4(utf16_string2)
240+
string2 = pm.eval(f'new String("{string1}")')
245241
assert len(string1) == length
246242
assert len(string2) == length
247243
assert len(string1) == len(string2)

0 commit comments

Comments
 (0)