Merge pull request #98 from Distributive-Network/Xmader/fix/non-BMP-string

zollqir · web-flow · commit c552bfe3b265 · 2023-07-19T15:01:40.000-04:00
Fix non-BMP strings
diff --git a/include/StrType.hh b/include/StrType.hh
@@ -50,7 +50,7 @@ public:
    * @return PyObject* - the UCS4-encoding of the pyObject string
    *
    */
-  PyObject *asUCS4();
+  static PyObject *asUCS4(PyObject *pyObject);
 };
 
 #endif
diff --git a/include/modules/pythonmonkey/pythonmonkey.hh b/include/modules/pythonmonkey/pythonmonkey.hh
@@ -64,15 +64,6 @@ void handleSharedPythonMonkeyMemory(JSContext *cx, JSGCStatus status, JS::GCReas
  */
 static PyObject *collect(PyObject *self, PyObject *args);
 
-/**
- * @brief Function exposed by the python module to convert UTF16 strings to UCS4 strings
- *
- * @param self - Pointer to the module object
- * @param args - Pointer to the python tuple of arguments (expected to contain a UTF16-encoded string as the first element)
- * @return PyObject* - A new python string in UCS4 encoding
- */
-static PyObject *asUCS4(PyObject *self, PyObject *args);
-
 /**
  * @brief Function exposed by the python module for evaluating arbitrary JS code
  *
diff --git a/python/pythonmonkey/pythonmonkey.pyi b/python/pythonmonkey/pythonmonkey.pyi
@@ -16,15 +16,6 @@ def collect() -> None:
     Calls the spidermonkey garbage collector
     """
 
-@_typing.overload
-def asUCS4(utf16_str: str, /) -> str:
-    """
-    Expects a python string in UTF16 encoding, and returns a new equivalent string in UCS4.
-    Undefined behaviour if the string is not in UTF16.
-    """
-@_typing.overload
-def asUCS4(anything_else: _typing.Any, /) -> _typing.NoReturn: ...
-
 class bigint(int):
     """
     Representing JavaScript BigInt in Python
diff --git a/src/StrType.cc b/src/StrType.cc
@@ -33,6 +33,18 @@
   #define PY_UNICODE_OBJECT_READY(op)       (PY_ASCII_OBJECT_CAST(op)->state.ready)
 #endif
 
+/**
+ * @brief Check whether the `length` UTF-16 code units in `chars` include any surrogate code unit; despite the name, a single unpaired surrogate is enough to return true
+ */
+static bool containsSurrogatePair(const char16_t *chars, size_t length) {
+  for (size_t i = 0; i < length; i++) {
+    if (Py_UNICODE_IS_SURROGATE(chars[i])) { // tests one code unit at a time; pairing is not verified here
+      return true; // stop at the first surrogate found
+    }
+  }
+  return false; // no surrogate code unit found (also the result for length == 0)
+}
+
 StrType::StrType(PyObject *object) : PyType(object) {}
 
 StrType::StrType(char *string) : PyType(Py_BuildValue("s", string)) {}
@@ -91,50 +103,52 @@ StrType::StrType(JSContext *cx, JSString *str) {
     }
     PY_UNICODE_OBJECT_READY(pyObject) = 1;
   #endif
+
+    if (containsSurrogatePair(chars, length)) {
+      // We must convert to UCS4 here because Python does not support encoding strings containing surrogate pairs to bytes
+      PyObject *ucs4Obj = asUCS4(pyObject); // convert to a new PyUnicodeObject with UCS4 data
+      if (!ucs4Obj) {
+        // conversion fails, keep the original `pyObject`
+        return;
+      }
+      Py_DECREF(pyObject); // cleanup the old `pyObject`
+      Py_INCREF(ucs4Obj); // XXX: Same as the above `Py_INCREF(pyObject);`. Why double freed on GC?
+      pyObject = ucs4Obj;
+    }
   }
 }
 
 const char *StrType::getValue() const {
   return PyUnicode_AsUTF8(pyObject);
 }
 
-PyObject *StrType::asUCS4() {
+/* static */
+PyObject *StrType::asUCS4(PyObject *pyObject) {
+  if (PyUnicode_KIND(pyObject) != PyUnicode_2BYTE_KIND) {
+    // return a new reference to match the behaviour of `PyUnicode_FromKindAndData`
+    Py_INCREF(pyObject);
+    return pyObject;
+  }
+
   uint16_t *chars = PY_UNICODE_OBJECT_DATA_UCS2(pyObject);
   size_t length = PY_UNICODE_OBJECT_LENGTH(pyObject);
 
   uint32_t ucs4String[length];
   size_t ucs4Length = 0;
 
-  for (size_t i = 0; i < length; i++) {
-    if (chars[i] >= LOW_SURROGATE_START && chars[i] <= LOW_SURROGATE_END) // character is an unpaired low surrogate
-    {
-      char hexString[5];
-      sprintf(hexString, "%x", (unsigned int)chars[i]);
-      std::string errorString = std::string("string contains an unpaired low surrogate at position: ") + std::to_string(i) + std::string(" with a value of 0x") + hexString;
-      PyErr_SetString(PyExc_UnicodeTranslateError, errorString.c_str());
+  for (size_t i = 0; i < length; i++, ucs4Length++) {
+    if (Py_UNICODE_IS_LOW_SURROGATE(chars[i])) { // character is an unpaired low surrogate
       return NULL;
-    }
-    else if (chars[i] >= HIGH_SURROGATE_START && chars[i] <= HIGH_SURROGATE_END) { // character is a high surrogate
-      if ((i + 1 < length) && chars[i+1] >= LOW_SURROGATE_START && chars[i+1] <= LOW_SURROGATE_END) { // next character is a low surrogate
-        // see https://www.unicode.org/faq/utf_bom.html#utf16-3 for details
-        uint32_t X = (chars[i] & ((1 << 6) -1)) << 10 | chars[i+1] & ((1 << 10) -1);
-        uint32_t W = (chars[i] >> 6) & ((1 << 5) - 1);
-        uint32_t U = W+1;
-        ucs4String[ucs4Length] = U << 16 | X;
-        ucs4Length++;
+    } else if (Py_UNICODE_IS_HIGH_SURROGATE(chars[i])) { // character is a high surrogate
+      if ((i + 1 < length) && Py_UNICODE_IS_LOW_SURROGATE(chars[i+1])) { // next character is a low surrogate
+        ucs4String[ucs4Length] = Py_UNICODE_JOIN_SURROGATES(chars[i], chars[i+1]);
         i++; // skip over low surrogate
       }
       else { // next character is not a low surrogate
-        char hexString[5];
-        sprintf(hexString, "%x", (unsigned int)chars[i]);
-        std::string errorString = std::string("string contains an unpaired high surrogate at position: ") + std::to_string(i) + std::string(" with a value of 0x") + hexString;
-        PyErr_SetString(PyExc_UnicodeTranslateError, errorString.c_str());
         return NULL;
       }
-    }
-    else { // character is not a surrogate, and is in the BMP
+    } else { // character is not a surrogate, and is in the BMP
       ucs4String[ucs4Length] = chars[i];
-      ucs4Length++;
     }
   }
 
diff --git a/src/modules/pythonmonkey/pythonmonkey.cc b/src/modules/pythonmonkey/pythonmonkey.cc
@@ -144,16 +144,6 @@ static PyObject *collect(PyObject *self, PyObject *args) {
   Py_RETURN_NONE;
 }
 
-static PyObject *asUCS4(PyObject *self, PyObject *args) {
-  StrType *str = new StrType(PyTuple_GetItem(args, 0));
-  if (!PyUnicode_Check(str->getPyObject())) {
-    PyErr_SetString(PyExc_TypeError, "pythonmonkey.asUCS4 expects a string as its first argument");
-    return NULL;
-  }
-
-  return str->asUCS4();
-}
-
 static bool getEvalOption(PyObject *evalOptions, const char *optionName, const char **s_p) {
   PyObject *value;
 
@@ -281,7 +271,6 @@ PyMethodDef PythonMonkeyMethods[] = {
   {"eval", eval, METH_VARARGS, "Javascript evaluator in Python"},
   {"isCompilableUnit", isCompilableUnit, METH_VARARGS, "Hint if a string might be compilable Javascript"},
   {"collect", collect, METH_VARARGS, "Calls the spidermonkey garbage collector"},
-  {"asUCS4", asUCS4, METH_VARARGS, "Expects a python string in UTF16 encoding, and returns a new equivalent string in UCS4. Undefined behaviour if the string is not in UTF16."},
   {NULL, NULL, 0, NULL}
 };
 
diff --git a/tests/python/test_pythonmonkey_eval.py b/tests/python/test_pythonmonkey_eval.py
@@ -225,7 +225,7 @@ def test_eval_functions_ucs4_string_args():
             codepoint = random.randint(0x010000, 0x10FFFF)
             string2 += chr(codepoint)
         
-        assert pm.asUCS4(concatenate(string1, string2)) == (string1 + string2)
+        assert concatenate(string1, string2) == (string1 + string2)
 
 def test_eval_functions_roundtrip():
     # BF-60 https://github.com/Distributive-Network/PythonMonkey/pull/18
diff --git a/tests/python/test_strings.py b/tests/python/test_strings.py
@@ -29,8 +29,7 @@ def test_eval_unpaired_surrogate_string_matches_evaluated_string():
 
 def test_eval_ucs4_string_matches_evaluated_string():
     py_ucs4_string = "🀄🀛🜢"
-    js_utf16_string = pm.eval(repr(py_ucs4_string))
-    js_ucs4_string = pm.asUCS4(js_utf16_string)
+    js_ucs4_string = pm.eval(repr(py_ucs4_string))
     assert py_ucs4_string == js_ucs4_string
 
 def test_eval_latin1_string_fuzztest():
@@ -111,8 +110,7 @@ def test_eval_ucs4_string_fuzztest():
         INITIAL_STRING = string1
         m = 10
         for _ in range(m):
-            utf16_string2 = pm.eval("'" + string1 + "'")
-            string2 = pm.asUCS4(utf16_string2)
+            string2 = pm.eval("'" + string1 + "'")
             assert len(string1) == length
             assert len(string2) == length
             assert len(string1) == len(string2)
@@ -158,8 +156,7 @@ def test_eval_boxed_unpaired_surrogate_string_matches_evaluated_string():
 
 def test_eval_boxed_ucs4_string_matches_evaluated_string():
     py_ucs4_string = "🀄🀛🜢"
-    js_utf16_string = pm.eval(f'new String({repr(py_ucs4_string)})')
-    js_ucs4_string = pm.asUCS4(js_utf16_string)
+    js_ucs4_string = pm.eval(f'new String({repr(py_ucs4_string)})')
     assert py_ucs4_string == js_ucs4_string
 
 def test_eval_boxed_latin1_string_fuzztest():
@@ -240,8 +237,7 @@ def test_eval_boxed_ucs4_string_fuzztest():
         INITIAL_STRING = string1
         m = 10
         for _ in range(m):
-            utf16_string2 = pm.eval(f'new String("{string1}")')
-            string2 = pm.asUCS4(utf16_string2)
+            string2 = pm.eval(f'new String("{string1}")')
             assert len(string1) == length
             assert len(string2) == length
             assert len(string1) == len(string2)

Original file line number	Diff line number	Diff line change
`@@ -50,7 +50,7 @@ public:`
`50`	`50`	`* @return PyObject* - the UCS4-encoding of the pyObject string`
`51`	`51`	`*`
`52`	`52`	`*/`
`53`		`- PyObject *asUCS4();`
	`53`	`+ static PyObject *asUCS4(PyObject *pyObject);`
`54`	`54`	`};`
`55`	`55`
`56`	`56`	`#endif`