python · davidhewitt · Mar 10, 2023 · davidhewitt · Mar 10, 2023
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
@@ -97,46 +97,41 @@ typedef struct {
     PyObject_HEAD
     Py_ssize_t length;          /* Number of code points in the string */
     Py_hash_t hash;             /* Hash value; -1 if not set */
-    struct {
-        /* If interned is set, the two references from the
-           dictionary to this object are *not* counted in ob_refcnt. */
-        unsigned int interned:1;
-        /* Character size:
-
-           - PyUnicode_1BYTE_KIND (1):
-
-             * character type = Py_UCS1 (8 bits, unsigned)
-             * all characters are in the range U+0000-U+00FF (latin1)
-             * if ascii is set, all characters are in the range U+0000-U+007F
-               (ASCII), otherwise at least one character is in the range
-               U+0080-U+00FF
-
-           - PyUnicode_2BYTE_KIND (2):
-
-             * character type = Py_UCS2 (16 bits, unsigned)
-             * all characters are in the range U+0000-U+FFFF (BMP)
-             * at least one character is in the range U+0100-U+FFFF
-
-           - PyUnicode_4BYTE_KIND (4):
-
-             * character type = Py_UCS4 (32 bits, unsigned)
-             * all characters are in the range U+0000-U+10FFFF
-             * at least one character is in the range U+10000-U+10FFFF
-         */
-        unsigned int kind:3;
-        /* Compact is with respect to the allocation scheme. Compact unicode
-           objects only require one memory block while non-compact objects use
-           one block for the PyUnicodeObject struct and another for its data
-           buffer. */
-        unsigned int compact:1;
-        /* The string only contains characters in the range U+0000-U+007F (ASCII)
-           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
-           set, use the PyASCIIObject structure. */
-        unsigned int ascii:1;
-        /* Padding to ensure that PyUnicode_DATA() is always aligned to
-           4 bytes (see issue #19537 on m68k). */
-        unsigned int :26;
-    } state;
+    /* If interned is set, the two references from the
+        dictionary to this object are *not* counted in ob_refcnt. */
+    uint8_t interned;
+    /* Character size:
+
+        - PyUnicode_1BYTE_KIND (1):
+
+            * character type = Py_UCS1 (8 bits, unsigned)
+            * all characters are in the range U+0000-U+00FF (latin1)
+            * if ascii is set, all characters are in the range U+0000-U+007F
+              (ASCII), otherwise at least one character is in the range
+              U+0080-U+00FF
+
+        - PyUnicode_2BYTE_KIND (2):
+
+            * character type = Py_UCS2 (16 bits, unsigned)
+            * all characters are in the range U+0000-U+FFFF (BMP)
+            * at least one character is in the range U+0100-U+FFFF
+
+        - PyUnicode_4BYTE_KIND (4):
+
+            * character type = Py_UCS4 (32 bits, unsigned)
+            * all characters are in the range U+0000-U+10FFFF
+            * at least one character is in the range U+10000-U+10FFFF
+        */
+    uint8_t kind;
+    /* Compact is with respect to the allocation scheme. Compact unicode
+        objects only require one memory block while non-compact objects use
+        one block for the PyUnicodeObject struct and another for its data
+        buffer. */
+    uint8_t compact;
+    /* The string only contains characters in the range U+0000-U+007F (ASCII)
+        and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
+        set, use the PyASCIIObject structure. */
+    uint8_t ascii;
 } PyASCIIObject;
 
 /* Non-ASCII strings allocated through PyUnicode_New use the
@@ -178,15 +173,9 @@ PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
 
 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
 
-/* Values for PyASCIIObject.state: */
-
-/* Interning state. */
-#define SSTATE_NOT_INTERNED 0
-#define SSTATE_INTERNED_MORTAL 1
-
 /* Use only if you know it's a string */
 static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
-    return _PyASCIIObject_CAST(op)->state.interned;
+    return _PyASCIIObject_CAST(op)->interned;
 }
 #define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
 
@@ -200,21 +189,21 @@ static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
    string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
    ready. */
 static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
-    return _PyASCIIObject_CAST(op)->state.ascii;
+    return _PyASCIIObject_CAST(op)->ascii;
 }
 #define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
 
 /* Return true if the string is compact or 0 if not.
    No type checks or Ready calls are performed. */
 static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
-    return _PyASCIIObject_CAST(op)->state.compact;
+    return _PyASCIIObject_CAST(op)->compact;
 }
 #define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
 
 /* Return true if the string is a compact ASCII string (use PyASCIIObject
    structure), or 0 if not.  No type checks or Ready calls are performed. */
 static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
-    return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
+    return (_PyASCIIObject_CAST(op)->ascii && PyUnicode_IS_COMPACT(op));
 }
 #define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
 
@@ -231,7 +220,7 @@ enum PyUnicode_Kind {
 // new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
 // unsigned numbers) where kind type is an int or on
 // "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
-#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
+#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->kind)
 
 /* Return a void pointer to the raw unicode buffer. */
 static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {

@@ -155,11 +155,9 @@ extern PyTypeObject _PyExc_MemoryError;
         .ob_base = _PyObject_IMMORTAL_INIT(&PyUnicode_Type), \
         .length = sizeof(LITERAL) - 1, \
         .hash = -1, \
-        .state = { \
-            .kind = 1, \
-            .compact = 1, \
-            .ascii = (ASCII), \
-        }, \
+        .kind = 1, \
+        .compact = 1, \
+        .ascii = (ASCII), \
     }
 #define _PyASCIIObject_INIT(LITERAL) \
     { \

@@ -1555,5 +1555,11 @@ def func2(x=None):
         self.do_test(func2)
 
 
+class Test_UnicodeObjectAlignment(unittest.TestCase):
+
+    def test_unicodeobject_data_alignment(self):
+        _testinternalcapi.check_compactunicodeobject_data_alignment()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
@@ -684,6 +684,21 @@ clear_extension(PyObject *self, PyObject *args)
     Py_RETURN_NONE;
 }
 
+// Data follows PyCompactUnicodeObject directly, so its size must be a multiple
+// of 4 to keep Py_UCS4 data aligned; raise AssertionError otherwise.
+static PyObject *
+check_compactunicodeobject_data_alignment(PyObject *self, PyObject *Py_UNUSED(args))
+{
+    size_t data_offset = sizeof(PyCompactUnicodeObject);
+    if (data_offset % 4 != 0) {
+        PyErr_Format(PyExc_AssertionError,
+                     "PyCompactUnicodeObject size offset is %zu, "
+                     "needs to be multiple of 4 bytes", data_offset);
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
 
 static PyMethodDef module_functions[] = {
     {"get_configs", get_configs, METH_NOARGS},
@@ -707,6 +722,7 @@ static PyMethodDef module_functions[] = {
     _TESTINTERNALCAPI_OPTIMIZE_CFG_METHODDEF
     {"get_interp_settings", get_interp_settings, METH_VARARGS, NULL},
     {"clear_extension", clear_extension, METH_VARARGS, NULL},
+    {"check_compactunicodeobject_data_alignment", check_compactunicodeobject_data_alignment, METH_NOARGS, NULL},
     {NULL, NULL} /* sentinel */
 };
 

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -129,13 +129,16 @@ extern "C" {
 
 #define _PyUnicode_LENGTH(op)                           \
     (_PyASCIIObject_CAST(op)->length)
-#define _PyUnicode_STATE(op)                            \
-    (_PyASCIIObject_CAST(op)->state)
 #define _PyUnicode_HASH(op)                             \
     (_PyASCIIObject_CAST(op)->hash)
+#define _PyUnicode_INTERNED(op)                         \
+    (_PyASCIIObject_CAST(op)->interned)
 #define _PyUnicode_KIND(op)                             \
-    (assert(_PyUnicode_CHECK(op)),                      \
-     _PyASCIIObject_CAST(op)->state.kind)
+    (_PyASCIIObject_CAST(op)->kind)
+#define _PyUnicode_COMPACT(op)                          \
+    (_PyASCIIObject_CAST(op)->compact)
+#define _PyUnicode_ASCII(op)                            \
+    (_PyASCIIObject_CAST(op)->ascii)
 #define _PyUnicode_GET_LENGTH(op)                       \
     (assert(_PyUnicode_CHECK(op)),                      \
      _PyASCIIObject_CAST(op)->length)
@@ -497,21 +500,21 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
     CHECK(PyUnicode_Check(op));
 
     PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
-    int kind = ascii->state.kind;
+    int kind = ascii->kind;
 
-    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
+    if (ascii->ascii == 1 && ascii->compact == 1) {
         CHECK(kind == PyUnicode_1BYTE_KIND);
     }
     else {
         PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
         void *data;
 
-        if (ascii->state.compact == 1) {
+        if (ascii->compact == 1) {
             data = compact + 1;
             CHECK(kind == PyUnicode_1BYTE_KIND
                                  || kind == PyUnicode_2BYTE_KIND
                                  || kind == PyUnicode_4BYTE_KIND);
-            CHECK(ascii->state.ascii == 0);
+            CHECK(ascii->ascii == 0);
             CHECK(compact->utf8 != data);
         }
         else {
@@ -521,9 +524,9 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
             CHECK(kind == PyUnicode_1BYTE_KIND
                      || kind == PyUnicode_2BYTE_KIND
                      || kind == PyUnicode_4BYTE_KIND);
-            CHECK(ascii->state.compact == 0);
+            CHECK(ascii->compact == 0);
             CHECK(data != NULL);
-            if (ascii->state.ascii) {
+            if (ascii->ascii) {
                 CHECK(compact->utf8 == data);
                 CHECK(compact->utf8_length == ascii->length);
             }
@@ -551,7 +554,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
                 maxchar = ch;
         }
         if (kind == PyUnicode_1BYTE_KIND) {
-            if (ascii->state.ascii == 0) {
+            if (ascii->ascii == 0) {
                 CHECK(maxchar >= 128);
                 CHECK(maxchar <= 255);
             }
@@ -1108,9 +1111,9 @@ _PyUnicode_Dump(PyObject *op)
     PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
     const void *data;
 
-    if (ascii->state.compact)
+    if (ascii->compact)
     {
-        if (ascii->state.ascii)
+        if (ascii->ascii)
             data = (ascii + 1);
         else
             data = (compact + 1);
@@ -1119,7 +1122,7 @@ _PyUnicode_Dump(PyObject *op)
         data = unicode->data.any;
     printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
 
-    if (!ascii->state.ascii) {
+    if (!ascii->ascii) {
         printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
     }
     printf(", data=%p\n", data);
@@ -1195,10 +1198,10 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
         data = unicode + 1;
     _PyUnicode_LENGTH(unicode) = size;
     _PyUnicode_HASH(unicode) = -1;
-    _PyUnicode_STATE(unicode).interned = 0;
-    _PyUnicode_STATE(unicode).kind = kind;
-    _PyUnicode_STATE(unicode).compact = 1;
-    _PyUnicode_STATE(unicode).ascii = is_ascii;
+    _PyUnicode_INTERNED(unicode) = 0;
+    _PyUnicode_KIND(unicode) = kind;
+    _PyUnicode_COMPACT(unicode) = 1;
+    _PyUnicode_ASCII(unicode) = is_ascii;
     if (is_ascii) {
         ((char*)data)[size] = 0;
     }
@@ -14372,10 +14375,10 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
 #else
     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
 #endif
-    _PyUnicode_STATE(self).interned = 0;
-    _PyUnicode_STATE(self).kind = kind;
-    _PyUnicode_STATE(self).compact = 0;
-    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
+    _PyUnicode_INTERNED(self) = 0;
+    _PyUnicode_KIND(self) = kind;
+    _PyUnicode_COMPACT(self) = 0;
+    _PyUnicode_ASCII(self) = _PyUnicode_ASCII(unicode);
     _PyUnicode_UTF8_LENGTH(self) = 0;
     _PyUnicode_UTF8(self) = NULL;
     _PyUnicode_DATA_ANY(self) = NULL;
@@ -14624,7 +14627,7 @@ PyUnicode_InternInPlace(PyObject **p)
        refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
        this. */
     Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
-    _PyUnicode_STATE(s).interned = 1;
+    _PyUnicode_INTERNED(s) = 1;
 }
 
 // Function kept for the stable ABI.
@@ -14683,7 +14686,7 @@ _PyUnicode_ClearInterned(PyInterpreterState *interp)
         total_length += PyUnicode_GET_LENGTH(s);
 #endif
 
-        _PyUnicode_STATE(s).interned = 0;
+        _PyUnicode_INTERNED(s) = 0;
     }
 #ifdef INTERNED_STATS
     fprintf(stderr,

diff --git a/Python/traceback.c b/Python/traceback.c
@@ -1092,9 +1092,9 @@ _Py_DumpASCII(int fd, PyObject *text)
         return;
 
     size = ascii->length;
-    kind = ascii->state.kind;
-    if (ascii->state.compact) {
-        if (ascii->state.ascii)
+    kind = ascii->kind;
+    if (ascii->compact) {
+        if (ascii->ascii)
             data = ascii + 1;
         else
             data = _PyCompactUnicodeObject_CAST(text) + 1;
@@ -1114,7 +1114,7 @@ _Py_DumpASCII(int fd, PyObject *text)
     }
 
     // Is an ASCII string?
-    if (ascii->state.ascii) {
+    if (ascii->ascii) {
         assert(kind == PyUnicode_1BYTE_KIND);
         char *str = data;
 
@@ -1341,4 +1341,3 @@ _Py_DumpTracebackThreads(int fd, PyInterpreterState *interp,
 
     return NULL;
 }
-
@@ -198,10 +198,9 @@ def generate_unicode(self, name: str, s: str) -> str:
                     self.object_head("PyUnicode_Type")
                     self.write(f".length = {len(s)},")
                     self.write(".hash = -1,")
-                    with self.block(".state =", ","):
-                        self.write(".kind = 1,")
-                        self.write(".compact = 1,")
-                        self.write(".ascii = 1,")
+                    self.write(".kind = 1,")
+                    self.write(".compact = 1,")
+                    self.write(".ascii = 1,")
                 self.write(f"._data = {make_string_literal(s.encode('ascii'))},")
                 return f"& {name}._ascii.ob_base"
             else:
@@ -210,10 +209,9 @@ def generate_unicode(self, name: str, s: str) -> str:
                         self.object_head("PyUnicode_Type")
                         self.write(f".length = {len(s)},")
                         self.write(".hash = -1,")
-                        with self.block(".state =", ","):
-                            self.write(f".kind = {kind},")
-                            self.write(".compact = 1,")
-                            self.write(".ascii = 0,")
+                        self.write(f".kind = {kind},")
+                        self.write(".compact = 1,")
+                        self.write(".ascii = 0,")
                     utf8 = s.encode('utf-8')
                     self.write(f'.utf8 = {make_string_literal(utf8)},')
                     self.write(f'.utf8_length = {len(utf8)},')