add StringDType to Unicode_ cast

ngoldbaum · ngoldbaum · commit cabab7f2b6c6 · 2023-01-17T12:17:05.000-07:00
diff --git a/stringdtype/stringdtype/src/casts.c b/stringdtype/stringdtype/src/casts.c
@@ -3,11 +3,11 @@
 #include "dtype.h"
 
 static NPY_CASTING
-string_resolve_descriptors(PyObject *NPY_UNUSED(self),
-                           PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
-                           PyArray_Descr *given_descrs[2],
-                           PyArray_Descr *loop_descrs[2],
-                           npy_intp *NPY_UNUSED(view_offset))
+string_to_string_resolve_descriptors(PyObject *NPY_UNUSED(self),
+                                     PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+                                     PyArray_Descr *given_descrs[2],
+                                     PyArray_Descr *loop_descrs[2],
+                                     npy_intp *NPY_UNUSED(view_offset))
 {
     Py_INCREF(given_descrs[0]);
     loop_descrs[0] = given_descrs[0];
@@ -50,7 +50,7 @@ string_to_string(PyArrayMethod_Context *context, char *const data[],
 static PyArray_DTypeMeta *s2s_dtypes[2] = {NULL, NULL};
 
 static PyType_Slot s2s_slots[] = {
-        {NPY_METH_resolve_descriptors, &string_resolve_descriptors},
+        {NPY_METH_resolve_descriptors, &string_to_string_resolve_descriptors},
         {NPY_METH_strided_loop, &string_to_string},
         {NPY_METH_unaligned_strided_loop, &string_to_string},
         {0, NULL}};
@@ -65,33 +65,57 @@ PyArrayMethod_Spec StringToStringCastSpec = {
         .slots = s2s_slots,
 };
 
+static NPY_CASTING
+unicode_to_string_resolve_descriptors(PyObject *NPY_UNUSED(self),
+                                      PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+                                      PyArray_Descr *given_descrs[2],
+                                      PyArray_Descr *loop_descrs[2],
+                                      npy_intp *NPY_UNUSED(view_offset))
+{
+    Py_INCREF(given_descrs[0]);
+    loop_descrs[0] = given_descrs[0];
+
+    if (given_descrs[1] == NULL) {
+        loop_descrs[1] = (PyArray_Descr *)new_stringdtype_instance();
+    }
+    else {
+        Py_INCREF(given_descrs[1]);
+        loop_descrs[1] = given_descrs[1];
+    }
+
+    return NPY_SAFE_CASTING;
+}
+
 // converts UCS4 code point to 4-byte char* assumes in is a zero-filled 4 byte
-// array returns -1 if the code point is not a valid unicode code point, the
-// number of bytes in the in the UTF-8 character on success
+// array returns -1 if the code point is not a valid unicode code point,
+// returns the number of bytes in the UTF-8 character on success
 static int
-ucs4_to_utf8_char(const Py_UCS4 code, char *in)
+ucs4_code_to_utf8_char(const Py_UCS4 code, char *c)
 {
     if (code <= 0x7F) {
-        // ASCII
-        in[0] = (char)code;
+        // 0zzzzzzz -> 0zzzzzzz
+        c[0] = (char)code;
         return 1;
     }
     else if (code <= 0x07FF) {
-        in[0] = (0xc0 | (code >> 6));
-        in[1] = (0x80 | (code & 0x3f));
+        // 00000yyy yyzzzzzz -> 110yyyyy 10zzzzzz
+        c[0] = (0xC0 | (code >> 6));
+        c[1] = (0x80 | (code & 0x3F));
         return 2;
     }
     else if (code <= 0xFFFF) {
-        in[0] = (0xe0 | (code >> 12));
-        in[1] = (0x80 | ((code >> 6) & 0x3f));
-        in[2] = (0x80 | (code & 0x3f));
+        // xxxxyyyy yyzzzzzz -> 1110xxxx 10yyyyyy 10zzzzzz
+        c[0] = (0xe0 | (code >> 12));
+        c[1] = (0x80 | ((code >> 6) & 0x3f));
+        c[2] = (0x80 | (code & 0x3f));
         return 3;
     }
     else if (code <= 0x10FFFF) {
-        in[0] = (0xf0 | (code >> 18));
-        in[1] = (0x80 | ((code >> 12) & 0x3f));
-        in[2] = (0x80 | ((code >> 6) & 0x3f));
-        in[3] = (0x80 | (code & 0x3f));
+        // 000wwwxx xxxxyyyy yyzzzzzz -> 11110www 10xxxxxx 10yyyyyy 10zzzzzz
+        c[0] = (0xf0 | (code >> 18));
+        c[1] = (0x80 | ((code >> 12) & 0x3f));
+        c[2] = (0x80 | ((code >> 6) & 0x3f));
+        c[3] = (0x80 | (code & 0x3f));
         return 4;
     }
     return -1;
@@ -106,28 +130,31 @@ unicode_to_string(PyArrayMethod_Context *context, char *const data[],
     long in_size = (descrs[0]->elsize) / 4;
 
     npy_intp N = dimensions[0];
-    char *in = data[0];
+    Py_UCS4 *in = (Py_UCS4 *)data[0];
     char **out = (char **)data[1];
-    npy_intp in_stride = strides[0];
+
+    // 4 bytes per UCS4 character
+    npy_intp in_stride = strides[0] / 4;
     // strides are in bytes but pointer offsets are in pointer widths, so
     // divide by the element size (one pointer width) to get the pointer offset
     npy_intp out_stride = strides[1] / context->descriptors[1]->elsize;
 
     while (N--) {
         // pessimistically allocate 4 bytes per allowed character
-        char *out_buf = calloc(in_size * 4 + 1, sizeof(char));
+        // plus one byte for the null terminator
+        char *out_buf = malloc((in_size * 4 + 1) * sizeof(char));
         size_t out_num_bytes = 0;
         for (int i = 0; i < in_size; i++) {
             // get code point
-            Py_UCS4 code = ((Py_UCS4 *)in)[i];
+            Py_UCS4 code = in[i];
 
             if (code == 0) {
                 break;
             }
 
             // convert codepoint to UTF8 bytes
             char utf8_c[4] = {0};
-            size_t num_bytes = ucs4_to_utf8_char(code, utf8_c);
+            size_t num_bytes = ucs4_code_to_utf8_char(code, utf8_c);
             out_num_bytes += num_bytes;
 
             if (num_bytes == -1) {
@@ -159,7 +186,6 @@ unicode_to_string(PyArrayMethod_Context *context, char *const data[],
         // set out to the address of the beginning of the string
         out[0] = out_buf;
 
-        // increment out and in by strides
         in += in_stride;
         out += out_stride;
     }
@@ -168,12 +194,126 @@ unicode_to_string(PyArrayMethod_Context *context, char *const data[],
 }
 
 static PyType_Slot u2s_slots[] = {
-        {NPY_METH_resolve_descriptors, &string_resolve_descriptors},
+        {NPY_METH_resolve_descriptors, &unicode_to_string_resolve_descriptors},
         {NPY_METH_strided_loop, &unicode_to_string},
         {0, NULL}};
 
 static char *u2s_name = "cast_Unicode_to_StringDType";
 
+static NPY_CASTING
+string_to_unicode_resolve_descriptors(PyObject *NPY_UNUSED(self),
+                                      PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+                                      PyArray_Descr *given_descrs[2],
+                                      PyArray_Descr *loop_descrs[2],
+                                      npy_intp *NPY_UNUSED(view_offset))
+{
+    Py_INCREF(given_descrs[0]);
+    loop_descrs[0] = given_descrs[0];
+
+    if (given_descrs[1] == NULL) {
+        // currently there's no way to determine the correct output
+        // size, so set an error and bail
+        PyErr_SetString(
+                PyExc_TypeError,
+                "Casting from StringDType to a fixed-width dtype with an "
+                "unspecified size is not currently supported, specify "
+                "an explicit size for the output dtype instead.");
+        return (NPY_CASTING)-1;
+    }
+    else {
+        Py_INCREF(given_descrs[1]);
+        loop_descrs[1] = given_descrs[1];
+    }
+
+    return NPY_UNSAFE_CASTING;
+}
+
+// Given UTF-8 bytes in *c*, sets *code* to the corresponding unicode
+// codepoint for the next character, returning the size of the character in
+// bytes. Does not do any validation or error checking: assumes *c* is valid
+// utf-8
+static size_t
+utf8_char_to_ucs4_code(unsigned char *c, Py_UCS4 *code)
+{
+    if (c[0] <= 0x7F) {
+        // 0zzzzzzz -> 0zzzzzzz
+        *code = (Py_UCS4)(c[0]);
+        return 1;
+    }
+    else if (c[0] <= 0xDF) {
+        // 110yyyyy 10zzzzzz -> 00000yyy yyzzzzzz
+        *code = (Py_UCS4)(((c[0] << 6) + c[1]) - ((0xC0 << 6) + 0x80));
+        return 2;
+    }
+    else if (c[0] <= 0xEF) {
+        // 1110xxxx 10yyyyyy 10zzzzzz -> xxxxyyyy yyzzzzzz
+        *code = (Py_UCS4)(((c[0] << 12) + (c[1] << 6) + c[2]) -
+                          ((0xE0 << 12) + (0x80 << 6) + 0x80));
+        return 3;
+    }
+    else {
+        // 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+        *code = (Py_UCS4)(((c[0] << 18) + (c[1] << 12) + (c[2] << 6) + c[3]) -
+                          ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80));
+        return 4;
+    }
+}
+
+static int
+string_to_unicode(PyArrayMethod_Context *context, char *const data[],
+                  npy_intp const dimensions[], npy_intp const strides[],
+                  NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char **in = (char **)data[0];
+    Py_UCS4 *out = (Py_UCS4 *)data[1];
+    // strides are in bytes but pointer offsets are in pointer widths, so
+    // divide by the element size (one pointer width) to get the pointer offset
+    npy_intp in_stride = strides[0] / context->descriptors[0]->elsize;
+    // 4 bytes per UCS4 character
+    npy_intp out_stride = strides[1] / 4;
+    // max number of 4 byte UCS4 characters that can fit in the output
+    long max_out_size = (context->descriptors[1]->elsize) / 4;
+
+    while (N--) {
+        unsigned char *this_string = (unsigned char *)*in;
+
+        for (int i = 0; i < max_out_size; i++) {
+            Py_UCS4 code;
+
+            // get code point for character this_string is currently pointing
+            // to
+            size_t num_bytes = utf8_char_to_ucs4_code(this_string, &code);
+
+            // move to next character
+            this_string += num_bytes;
+
+            // set output codepoint
+            out[i] = code;
+
+            // check if this is the null terminator
+            if (code == 0) {
+                // fill all remaining characters (if any) with zero
+                for (int j = i + 1; j < max_out_size; j++) {
+                    out[j] = 0;
+                }
+                break;
+            }
+        }
+        in += in_stride;
+        out += out_stride;
+    }
+
+    return 0;
+}
+
+static PyType_Slot s2u_slots[] = {
+        {NPY_METH_resolve_descriptors, &string_to_unicode_resolve_descriptors},
+        {NPY_METH_strided_loop, &string_to_unicode},
+        {0, NULL}};
+
+static char *s2u_name = "cast_StringDType_to_Unicode";
+
 PyArrayMethod_Spec **
 get_casts(void)
 {
@@ -192,10 +332,26 @@ get_casts(void)
     UnicodeToStringCastSpec->dtypes = u2s_dtypes;
     UnicodeToStringCastSpec->slots = u2s_slots;
 
-    PyArrayMethod_Spec **casts = malloc(3 * sizeof(PyArrayMethod_Spec *));
+    PyArray_DTypeMeta **s2u_dtypes = malloc(2 * sizeof(PyArray_DTypeMeta *));
+    s2u_dtypes[0] = NULL;
+    s2u_dtypes[1] = &PyArray_UnicodeDType;
+
+    PyArrayMethod_Spec *StringToUnicodeCastSpec =
+            malloc(sizeof(PyArrayMethod_Spec));
+
+    StringToUnicodeCastSpec->name = s2u_name;
+    StringToUnicodeCastSpec->nin = 1;
+    StringToUnicodeCastSpec->nout = 1;
+    StringToUnicodeCastSpec->casting = NPY_SAFE_CASTING;
+    StringToUnicodeCastSpec->flags = NPY_METH_NO_FLOATINGPOINT_ERRORS;
+    StringToUnicodeCastSpec->dtypes = s2u_dtypes;
+    StringToUnicodeCastSpec->slots = s2u_slots;
+
+    PyArrayMethod_Spec **casts = malloc(4 * sizeof(PyArrayMethod_Spec *));
     casts[0] = &StringToStringCastSpec;
     casts[1] = UnicodeToStringCastSpec;
-    casts[2] = NULL;
+    casts[2] = StringToUnicodeCastSpec;
+    casts[3] = NULL;
 
     return casts;
 }
diff --git a/stringdtype/stringdtype/src/dtype.c b/stringdtype/stringdtype/src/dtype.c
@@ -228,6 +228,8 @@ init_string_dtype(void)
 
     free(StringDType_DTypeSpec.casts[1]->dtypes);
     free(StringDType_DTypeSpec.casts[1]);
+    free(StringDType_DTypeSpec.casts[2]->dtypes);
+    free(StringDType_DTypeSpec.casts[2]);
     free(StringDType_DTypeSpec.casts);
 
     return 0;
diff --git a/stringdtype/tests/test_stringdtype.py b/stringdtype/tests/test_stringdtype.py
@@ -62,38 +62,26 @@ def test_bad_scalars(data):
     [
         ["this", "is", "an", "array"],
         ["€", "", "😊"],
+        ["A¢☃€ 😊", " A☃€¢😊", "☃€😊 A¢", "😊☃A¢ €"],
     ],
 )
-def test_cast_to_stringdtype(string_list):
+def test_unicode_casts(string_list):
     arr = np.array(string_list, dtype=np.unicode_).astype(StringDType())
     expected = np.array(string_list, dtype=StringDType())
     np.testing.assert_array_equal(arr, expected)
 
-
-@pytest.mark.xfail(reason="Not yet implemented")
-def test_cast_to_unicode_safe(string_list):
     arr = np.array(string_list, dtype=StringDType())
 
     np.testing.assert_array_equal(
-        arr.astype("<U3", casting="safe"), np.array(string_list, dtype="<U3")
+        arr.astype("U8"), np.array(string_list, dtype="U8")
     )
-
-    # Safe casting should preserve data
-    with pytest.raises(TypeError):
-        arr.astype("<U2", casting="safe")
-
-
-@pytest.mark.xfail(reason="Not yet implemented")
-def test_cast_to_unicode_unsafe(string_list):
-    arr = np.array(string_list, dtype=StringDType())
-
+    np.testing.assert_array_equal(arr.astype("U8").astype(StringDType()), arr)
     np.testing.assert_array_equal(
-        arr.astype("<U3", casting="unsafe"), np.array(string_list, dtype="<U3")
+        arr.astype("U3"), np.array(string_list, dtype="U3")
     )
-
-    # Unsafe casting: each element is truncated
     np.testing.assert_array_equal(
-        arr.astype("<U2", casting="unsafe"), np.array(string_list, dtype="<U2")
+        arr.astype("U3").astype(StringDType()),
+        np.array([s[:3] for s in string_list], dtype=StringDType()),
     )