@@ -47,6 +47,63 @@ static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
4747
4848/* --- Unicode Type ------------------------------------------------------- */
4949
50+ struct _PyUnicodeObject_state { /* Per-string state flags; embedded as the state member of PyASCIIObject. */
51+ /* If interned is non-zero, the two references from the
52+ dictionary to this object are *not* counted in ob_refcnt.
53+ The possible values here are:
54+ 0: Not Interned
55+ 1: Interned
56+ 2: Interned and Immortal
57+ 3: Interned, Immortal, and Static
58+ This categorization allows the runtime to determine the right
59+ cleanup mechanism at runtime shutdown. */
60+ #ifdef Py_GIL_DISABLED
61+ // Needs to be accessed atomically, so can't be a bit field.
62+ unsigned char interned ;
63+ #else
64+ unsigned int interned :2 ;
65+ #endif
66+ /* Character size:
67+
68+ - PyUnicode_1BYTE_KIND (1):
69+
70+ * character type = Py_UCS1 (8 bits, unsigned)
71+ * all characters are in the range U+0000-U+00FF (latin1)
72+ * if ascii is set, all characters are in the range U+0000-U+007F
73+ (ASCII), otherwise at least one character is in the range
74+ U+0080-U+00FF
75+
76+ - PyUnicode_2BYTE_KIND (2):
77+
78+ * character type = Py_UCS2 (16 bits, unsigned)
79+ * all characters are in the range U+0000-U+FFFF (BMP)
80+ * at least one character is in the range U+0100-U+FFFF
81+
82+ - PyUnicode_4BYTE_KIND (4):
83+
84+ * character type = Py_UCS4 (32 bits, unsigned)
85+ * all characters are in the range U+0000-U+10FFFF
86+ * at least one character is in the range U+10000-U+10FFFF
87+ */
88+ unsigned int kind :3 ;
89+ /* Compact is with respect to the allocation scheme. Compact unicode
90+ objects only require one memory block while non-compact objects use
91+ one block for the PyUnicodeObject struct and another for its data
92+ buffer. */
93+ unsigned int compact :1 ;
94+ /* The string only contains characters in the range U+0000-U+007F (ASCII)
95+ and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
96+ set, use the PyASCIIObject structure. */
97+ unsigned int ascii :1 ;
98+ /* The object is statically allocated. */
99+ unsigned int statically_allocated :1 ;
100+ #ifndef Py_GIL_DISABLED
101+ /* Historical: padding to ensure that PyUnicode_DATA() is always aligned to
102+ 4 bytes (see issue gh-63736 on m68k). Together with the fields above
     (2 + 3 + 1 + 1 + 1 bits) this unnamed 24-bit field brings the declared
     bit-field widths to 32 bits in total. The 4 byte alignment itself is
     now requested where this struct is embedded, through
     _Py_ALIGNED_DEF(4, ...) on the state member of PyASCIIObject. */
103+ unsigned int :24 ;
104+ #endif
105+ };
106+
50107/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
51108 structure. state.ascii and state.compact are set, and the data
52109 immediately follow the structure. utf8_length can be found
@@ -99,67 +156,8 @@ typedef struct {
99156 PyObject_HEAD
100157 Py_ssize_t length ; /* Number of code points in the string */
101158 Py_hash_t hash ; /* Hash value; -1 if not set */
102- #ifdef Py_GIL_DISABLED
103- /* Ensure 4 byte alignment for PyUnicode_DATA(), see gh-63736 on m68k.
104- In the non-free-threaded build, we'll use explicit padding instead */
105- _Py_ALIGN_AS (4 )
106- #endif
107- struct {
108- /* If interned is non-zero, the two references from the
109- dictionary to this object are *not* counted in ob_refcnt.
110- The possible values here are:
111- 0: Not Interned
112- 1: Interned
113- 2: Interned and Immortal
114- 3: Interned, Immortal, and Static
115- This categorization allows the runtime to determine the right
116- cleanup mechanism at runtime shutdown. */
117- #ifdef Py_GIL_DISABLED
118- // Needs to be accessed atomically, so can't be a bit field.
119- unsigned char interned ;
120- #else
121- unsigned int interned :2 ;
122- #endif
123- /* Character size:
124-
125- - PyUnicode_1BYTE_KIND (1):
126-
127- * character type = Py_UCS1 (8 bits, unsigned)
128- * all characters are in the range U+0000-U+00FF (latin1)
129- * if ascii is set, all characters are in the range U+0000-U+007F
130- (ASCII), otherwise at least one character is in the range
131- U+0080-U+00FF
132-
133- - PyUnicode_2BYTE_KIND (2):
134-
135- * character type = Py_UCS2 (16 bits, unsigned)
136- * all characters are in the range U+0000-U+FFFF (BMP)
137- * at least one character is in the range U+0100-U+FFFF
138-
139- - PyUnicode_4BYTE_KIND (4):
140-
141- * character type = Py_UCS4 (32 bits, unsigned)
142- * all characters are in the range U+0000-U+10FFFF
143- * at least one character is in the range U+10000-U+10FFFF
144- */
145- unsigned int kind :3 ;
146- /* Compact is with respect to the allocation scheme. Compact unicode
147- objects only require one memory block while non-compact objects use
148- one block for the PyUnicodeObject struct and another for its data
149- buffer. */
150- unsigned int compact :1 ;
151- /* The string only contains characters in the range U+0000-U+007F (ASCII)
152- and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
153- set, use the PyASCIIObject structure. */
154- unsigned int ascii :1 ;
155- /* The object is statically allocated. */
156- unsigned int statically_allocated :1 ;
157- #ifndef Py_GIL_DISABLED
158- /* Padding to ensure that PyUnicode_DATA() is always aligned to
159- 4 bytes (see issue gh-63736 on m68k) */
160- unsigned int :24 ;
161- #endif
162- } state ;
159+ /* Ensure 4 byte alignment for PyUnicode_DATA(), see gh-63736 on m68k. */
160+ _Py_ALIGNED_DEF (4 , struct _PyUnicodeObject_state ) state ;
163161} PyASCIIObject ;
164162
165163/* Non-ASCII strings allocated through PyUnicode_New use the
0 commit comments