Skip to content

Commit 0504ae1

Browse files
committed
feat(string): automatically convert strings containing surrogate pairs to UCS4
1 parent 2db4d63 commit 0504ae1

File tree

1 file changed

+25
-0
lines changed

1 file changed

+25
-0
lines changed

src/StrType.cc

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,15 @@
3333
#define PY_UNICODE_OBJECT_READY(op) (PY_ASCII_OBJECT_CAST(op)->state.ready)
3434
#endif
3535

36+
static bool containsSurrogatePair(const char16_t *chars, size_t length) {
37+
for (size_t i = 0; i < length; i++) {
38+
if (Py_UNICODE_IS_SURROGATE(chars[i])) {
39+
return true;
40+
}
41+
}
42+
return false;
43+
}
44+
3645
// Construct a StrType from an existing Python object; `object` is passed straight through to the PyType base constructor.
StrType::StrType(PyObject *object) : PyType(object) {}
3746

3847
// Construct a StrType from a C string: the Python object is created with Py_BuildValue("s", string) and handed to the PyType base constructor.
// NOTE(review): Py_BuildValue can return NULL on failure; whether PyType tolerates a NULL object is not visible here — confirm.
StrType::StrType(char *string) : PyType(Py_BuildValue("s", string)) {}
@@ -91,6 +100,18 @@ StrType::StrType(JSContext *cx, JSString *str) {
91100
}
92101
PY_UNICODE_OBJECT_READY(pyObject) = 1;
93102
#endif
103+
104+
if (containsSurrogatePair(chars, length)) {
105+
// We must convert to UCS4 here because Python does not support encoding strings containing surrogate pairs to bytes
106+
PyObject *ucs4Obj = this->asUCS4(); // convert `pyObject` to a new PyUnicodeObject with UCS4 data
107+
if (!ucs4Obj) {
108+
// conversion fails, keep the original `pyObject`
109+
PyErr_Clear();
110+
return;
111+
}
112+
Py_DECREF(pyObject); // cleanup the old `pyObject`
113+
pyObject = Py_NewRef(ucs4Obj);
114+
}
94115
}
95116
}
96117

@@ -99,6 +120,10 @@ const char *StrType::getValue() const {
99120
}
100121

101122
PyObject *StrType::asUCS4() {
123+
if (PyUnicode_KIND(pyObject) != PyUnicode_2BYTE_KIND) {
124+
return Py_NewRef(pyObject);
125+
}
126+
102127
uint16_t *chars = PY_UNICODE_OBJECT_DATA_UCS2(pyObject);
103128
size_t length = PY_UNICODE_OBJECT_LENGTH(pyObject);
104129

0 commit comments

Comments
 (0)