Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 40 additions & 51 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,46 +97,41 @@ typedef struct {
PyObject_HEAD
Py_ssize_t length; /* Number of code points in the string */
Py_hash_t hash; /* Hash value; -1 if not set */
struct {
/* If interned is set, the two references from the
dictionary to this object are *not* counted in ob_refcnt. */
unsigned int interned:1;
/* Character size:

- PyUnicode_1BYTE_KIND (1):

* character type = Py_UCS1 (8 bits, unsigned)
* all characters are in the range U+0000-U+00FF (latin1)
* if ascii is set, all characters are in the range U+0000-U+007F
(ASCII), otherwise at least one character is in the range
U+0080-U+00FF

- PyUnicode_2BYTE_KIND (2):

* character type = Py_UCS2 (16 bits, unsigned)
* all characters are in the range U+0000-U+FFFF (BMP)
* at least one character is in the range U+0100-U+FFFF

- PyUnicode_4BYTE_KIND (4):

* character type = Py_UCS4 (32 bits, unsigned)
* all characters are in the range U+0000-U+10FFFF
* at least one character is in the range U+10000-U+10FFFF
*/
unsigned int kind:3;
/* Compact is with respect to the allocation scheme. Compact unicode
objects only require one memory block while non-compact objects use
one block for the PyUnicodeObject struct and another for its data
buffer. */
unsigned int compact:1;
/* The string only contains characters in the range U+0000-U+007F (ASCII)
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
set, use the PyASCIIObject structure. */
unsigned int ascii:1;
/* Padding to ensure that PyUnicode_DATA() is always aligned to
4 bytes (see issue #19537 on m68k). */
Comment on lines -136 to -137
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment on alignment seemed important, so I created a test in _testinternalcapi.c which I believe codifies this.

unsigned int :26;
} state;
/* If interned is set, the two references from the
dictionary to this object are *not* counted in ob_refcnt. */
uint8_t interned;
/* Character size:

- PyUnicode_1BYTE_KIND (1):

* character type = Py_UCS1 (8 bits, unsigned)
* all characters are in the range U+0000-U+00FF (latin1)
* if ascii is set, all characters are in the range U+0000-U+007F
(ASCII), otherwise at least one character is in the range
U+0080-U+00FF

- PyUnicode_2BYTE_KIND (2):

* character type = Py_UCS2 (16 bits, unsigned)
* all characters are in the range U+0000-U+FFFF (BMP)
* at least one character is in the range U+0100-U+FFFF

- PyUnicode_4BYTE_KIND (4):

* character type = Py_UCS4 (32 bits, unsigned)
* all characters are in the range U+0000-U+10FFFF
* at least one character is in the range U+10000-U+10FFFF
*/
uint8_t kind;
/* Compact is with respect to the allocation scheme. Compact unicode
objects only require one memory block while non-compact objects use
one block for the PyUnicodeObject struct and another for its data
buffer. */
uint8_t compact;
/* The string only contains characters in the range U+0000-U+007F (ASCII)
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
set, use the PyASCIIObject structure. */
uint8_t ascii;
} PyASCIIObject;

/* Non-ASCII strings allocated through PyUnicode_New use the
Expand Down Expand Up @@ -178,15 +173,9 @@ PyAPI_FUNC(int) _PyUnicode_CheckConsistency(

/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */

/* Values for PyASCIIObject.state: */

/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1

/* Use only if you know it's a string */
static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
return _PyASCIIObject_CAST(op)->state.interned;
return _PyASCIIObject_CAST(op)->interned;
}
#define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))

Expand All @@ -200,21 +189,21 @@ static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
ready. */
static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
return _PyASCIIObject_CAST(op)->state.ascii;
return _PyASCIIObject_CAST(op)->ascii;
}
#define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))

/* Return true if the string is compact or 0 if not.
No type checks or Ready calls are performed. */
static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
return _PyASCIIObject_CAST(op)->state.compact;
return _PyASCIIObject_CAST(op)->compact;
}
#define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))

/* Return true if the string is a compact ASCII string (use PyASCIIObject
structure), or 0 if not. No type checks or Ready calls are performed. */
static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
return (_PyASCIIObject_CAST(op)->ascii && PyUnicode_IS_COMPACT(op));
}
#define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))

Expand All @@ -231,7 +220,7 @@ enum PyUnicode_Kind {
// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
// unsigned numbers) where kind type is an int or on
// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->kind)

/* Return a void pointer to the raw unicode buffer. */
static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
Expand Down
8 changes: 3 additions & 5 deletions Include/internal/pycore_runtime_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,9 @@ extern PyTypeObject _PyExc_MemoryError;
.ob_base = _PyObject_IMMORTAL_INIT(&PyUnicode_Type), \
.length = sizeof(LITERAL) - 1, \
.hash = -1, \
.state = { \
.kind = 1, \
.compact = 1, \
.ascii = (ASCII), \
}, \
.kind = 1, \
.compact = 1, \
.ascii = (ASCII), \
}
#define _PyASCIIObject_INIT(LITERAL) \
{ \
Expand Down
6 changes: 6 additions & 0 deletions Lib/test/test_capi/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1555,5 +1555,11 @@ def func2(x=None):
self.do_test(func2)


class Test_UnicodeObjectAlignment(unittest.TestCase):
    """Check the memory layout requirement for compact unicode objects."""

    def test_unicodeobject_data_alignment(self):
        # Delegates to the C helper in _testinternalcapi, which sets
        # AssertionError when sizeof(PyCompactUnicodeObject) is not a
        # multiple of 4 bytes; the test passes if no exception is raised.
        _testinternalcapi.check_compactunicodeobject_data_alignment()


if __name__ == "__main__":
unittest.main()
16 changes: 16 additions & 0 deletions Modules/_testinternalcapi.c
Original file line number Diff line number Diff line change
Expand Up @@ -684,6 +684,21 @@ clear_extension(PyObject *self, PyObject *args)
Py_RETURN_NONE;
}

/* Test helper: verify that character data placed directly after a
   PyCompactUnicodeObject header starts on a 4-byte boundary.

   The data of a compact unicode object immediately follows the header, so
   sizeof(PyCompactUnicodeObject) is the data offset.  The widest character
   type (Py_UCS4, 32 bits) requires that offset to be a multiple of 4.

   Registered as METH_NOARGS, so both parameters are unused.
   Returns None on success; on failure sets AssertionError and returns NULL. */
static PyObject *
check_compactunicodeobject_data_alignment(PyObject *Py_UNUSED(self),
                                          PyObject *Py_UNUSED(args))
{
    size_t data_offset = sizeof(PyCompactUnicodeObject);
    if (data_offset % 4 != 0) {
        // This is required so that the data (which immediately follows a
        // compact unicode object) is correctly aligned in the largest case
        // (UCS_4).
        // Note: data_offset is a size_t, so it must be formatted with %zu;
        // "%i" would read it as an int.
        PyErr_Format(PyExc_AssertionError,
                     "PyCompactUnicodeObject size offset is %zu, "
                     "needs to be multiple of 4 bytes",
                     data_offset);
        return NULL;
    }
    Py_RETURN_NONE;
}


static PyMethodDef module_functions[] = {
{"get_configs", get_configs, METH_NOARGS},
Expand All @@ -707,6 +722,7 @@ static PyMethodDef module_functions[] = {
_TESTINTERNALCAPI_OPTIMIZE_CFG_METHODDEF
{"get_interp_settings", get_interp_settings, METH_VARARGS, NULL},
{"clear_extension", clear_extension, METH_VARARGS, NULL},
{"check_compactunicodeobject_data_alignment", check_compactunicodeobject_data_alignment, METH_NOARGS, NULL},
{NULL, NULL} /* sentinel */
};

Expand Down
51 changes: 27 additions & 24 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,16 @@ extern "C" {

#define _PyUnicode_LENGTH(op) \
(_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op) \
(_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op) \
(_PyASCIIObject_CAST(op)->hash)
#define _PyUnicode_INTERNED(op) \
(_PyASCIIObject_CAST(op)->interned)
#define _PyUnicode_KIND(op) \
(assert(_PyUnicode_CHECK(op)), \
_PyASCIIObject_CAST(op)->state.kind)
(_PyASCIIObject_CAST(op)->kind)
#define _PyUnicode_COMPACT(op) \
(_PyASCIIObject_CAST(op)->compact)
#define _PyUnicode_ASCII(op) \
(_PyASCIIObject_CAST(op)->ascii)
#define _PyUnicode_GET_LENGTH(op) \
(assert(_PyUnicode_CHECK(op)), \
_PyASCIIObject_CAST(op)->length)
Expand Down Expand Up @@ -497,21 +500,21 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
CHECK(PyUnicode_Check(op));

PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
int kind = ascii->state.kind;
int kind = ascii->kind;

if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
if (ascii->ascii == 1 && ascii->compact == 1) {
CHECK(kind == PyUnicode_1BYTE_KIND);
}
else {
PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
void *data;

if (ascii->state.compact == 1) {
if (ascii->compact == 1) {
data = compact + 1;
CHECK(kind == PyUnicode_1BYTE_KIND
|| kind == PyUnicode_2BYTE_KIND
|| kind == PyUnicode_4BYTE_KIND);
CHECK(ascii->state.ascii == 0);
CHECK(ascii->ascii == 0);
CHECK(compact->utf8 != data);
}
else {
Expand All @@ -521,9 +524,9 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
CHECK(kind == PyUnicode_1BYTE_KIND
|| kind == PyUnicode_2BYTE_KIND
|| kind == PyUnicode_4BYTE_KIND);
CHECK(ascii->state.compact == 0);
CHECK(ascii->compact == 0);
CHECK(data != NULL);
if (ascii->state.ascii) {
if (ascii->ascii) {
CHECK(compact->utf8 == data);
CHECK(compact->utf8_length == ascii->length);
}
Expand Down Expand Up @@ -551,7 +554,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
maxchar = ch;
}
if (kind == PyUnicode_1BYTE_KIND) {
if (ascii->state.ascii == 0) {
if (ascii->ascii == 0) {
CHECK(maxchar >= 128);
CHECK(maxchar <= 255);
}
Expand Down Expand Up @@ -1108,9 +1111,9 @@ _PyUnicode_Dump(PyObject *op)
PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
const void *data;

if (ascii->state.compact)
if (ascii->compact)
{
if (ascii->state.ascii)
if (ascii->ascii)
data = (ascii + 1);
else
data = (compact + 1);
Expand All @@ -1119,7 +1122,7 @@ _PyUnicode_Dump(PyObject *op)
data = unicode->data.any;
printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);

if (!ascii->state.ascii) {
if (!ascii->ascii) {
printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
}
printf(", data=%p\n", data);
Expand Down Expand Up @@ -1195,10 +1198,10 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
data = unicode + 1;
_PyUnicode_LENGTH(unicode) = size;
_PyUnicode_HASH(unicode) = -1;
_PyUnicode_STATE(unicode).interned = 0;
_PyUnicode_STATE(unicode).kind = kind;
_PyUnicode_STATE(unicode).compact = 1;
_PyUnicode_STATE(unicode).ascii = is_ascii;
_PyUnicode_INTERNED(unicode) = 0;
_PyUnicode_KIND(unicode) = kind;
_PyUnicode_COMPACT(unicode) = 1;
_PyUnicode_ASCII(unicode) = is_ascii;
if (is_ascii) {
((char*)data)[size] = 0;
}
Expand Down Expand Up @@ -14372,10 +14375,10 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
#else
_PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
_PyUnicode_STATE(self).interned = 0;
_PyUnicode_STATE(self).kind = kind;
_PyUnicode_STATE(self).compact = 0;
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
_PyUnicode_INTERNED(self) = 0;
_PyUnicode_KIND(self) = kind;
_PyUnicode_COMPACT(self) = 0;
_PyUnicode_ASCII(self) = _PyUnicode_ASCII(unicode);
_PyUnicode_UTF8_LENGTH(self) = 0;
_PyUnicode_UTF8(self) = NULL;
_PyUnicode_DATA_ANY(self) = NULL;
Expand Down Expand Up @@ -14624,7 +14627,7 @@ PyUnicode_InternInPlace(PyObject **p)
refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
this. */
Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
_PyUnicode_STATE(s).interned = 1;
_PyUnicode_INTERNED(s) = 1;
}

// Function kept for the stable ABI.
Expand Down Expand Up @@ -14683,7 +14686,7 @@ _PyUnicode_ClearInterned(PyInterpreterState *interp)
total_length += PyUnicode_GET_LENGTH(s);
#endif

_PyUnicode_STATE(s).interned = 0;
_PyUnicode_INTERNED(s) = 0;
}
#ifdef INTERNED_STATS
fprintf(stderr,
Expand Down
9 changes: 4 additions & 5 deletions Python/traceback.c
Original file line number Diff line number Diff line change
Expand Up @@ -1092,9 +1092,9 @@ _Py_DumpASCII(int fd, PyObject *text)
return;

size = ascii->length;
kind = ascii->state.kind;
if (ascii->state.compact) {
if (ascii->state.ascii)
kind = ascii->kind;
if (ascii->compact) {
if (ascii->ascii)
data = ascii + 1;
else
data = _PyCompactUnicodeObject_CAST(text) + 1;
Expand All @@ -1114,7 +1114,7 @@ _Py_DumpASCII(int fd, PyObject *text)
}

// Is an ASCII string?
if (ascii->state.ascii) {
if (ascii->ascii) {
assert(kind == PyUnicode_1BYTE_KIND);
char *str = data;

Expand Down Expand Up @@ -1341,4 +1341,3 @@ _Py_DumpTracebackThreads(int fd, PyInterpreterState *interp,

return NULL;
}

14 changes: 6 additions & 8 deletions Tools/build/deepfreeze.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,9 @@ def generate_unicode(self, name: str, s: str) -> str:
self.object_head("PyUnicode_Type")
self.write(f".length = {len(s)},")
self.write(".hash = -1,")
with self.block(".state =", ","):
self.write(".kind = 1,")
self.write(".compact = 1,")
self.write(".ascii = 1,")
self.write(".kind = 1,")
self.write(".compact = 1,")
self.write(".ascii = 1,")
self.write(f"._data = {make_string_literal(s.encode('ascii'))},")
return f"& {name}._ascii.ob_base"
else:
Expand All @@ -210,10 +209,9 @@ def generate_unicode(self, name: str, s: str) -> str:
self.object_head("PyUnicode_Type")
self.write(f".length = {len(s)},")
self.write(".hash = -1,")
with self.block(".state =", ","):
self.write(f".kind = {kind},")
self.write(".compact = 1,")
self.write(".ascii = 0,")
self.write(f".kind = {kind},")
self.write(".compact = 1,")
self.write(".ascii = 0,")
utf8 = s.encode('utf-8')
self.write(f'.utf8 = {make_string_literal(utf8)},')
self.write(f'.utf8_length = {len(utf8)},')
Expand Down
Loading