Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Optimize decoding of short UTF-8 sequences containing non-ASCII characters
by approximately 1.5x.
255 changes: 241 additions & 14 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -1305,6 +1305,45 @@
}
#endif

// Simplified version of PyUnicode_New() that only creates compact ASCII
// strings.  The caller must ensure size > 0; the size == 0 case is not
// handled here (callers return the shared empty-string singleton instead).
//
// Returns a new reference, or NULL with MemoryError set on allocation
// failure or overflow.
static PyObject *
ascii_new(Py_ssize_t size)
{
    PyObject *obj;
    void *data;
    Py_ssize_t struct_size = sizeof(PyASCIIObject);

    // Guard the struct_size + size + 1 computation below against overflow.
    if (size > ((PY_SSIZE_T_MAX - struct_size) - 1))
        return PyErr_NoMemory();

    /* Duplicated allocation code from _PyObject_New() instead of a call to
     * PyObject_New() so we are able to allocate space for the object and
     * its data buffer in one allocation.
     */
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1));
    if (obj == NULL) {
        return PyErr_NoMemory();
    }
    _PyObject_Init(obj, &PyUnicode_Type);

    // For a compact ASCII object, the character data immediately follows
    // the PyASCIIObject header.
    data = ((PyASCIIObject*)obj) + 1;

    _PyUnicode_LENGTH(obj) = size;
    _PyUnicode_HASH(obj) = -1;
    _PyUnicode_STATE(obj).interned = 0;
    _PyUnicode_STATE(obj).kind = PyUnicode_1BYTE_KIND;
    _PyUnicode_STATE(obj).compact = 1;
    _PyUnicode_STATE(obj).ascii = 1;
    _PyUnicode_STATE(obj).statically_allocated = 0;
    // NUL-terminate; the rest of the buffer is left for the caller to fill.
    ((char*)data)[size] = 0;

#ifdef Py_DEBUG
    // BUG FIX: previous revision referenced an undeclared identifier
    // `unicode` here (caught by CI on all platforms); the local is `obj`.
    unicode_fill_invalid(obj, 0);
#endif
    assert(_PyUnicode_CheckConsistency(obj, 0));
    return obj;
}

PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
Expand Down Expand Up @@ -2208,13 +2247,16 @@
{
const unsigned char *s = (const unsigned char *)buffer;
PyObject *unicode;
if (size == 0) {
return unicode_get_empty();
}
if (size == 1) {
#ifdef Py_DEBUG
assert((unsigned char)s[0] < 128);
#endif
return get_latin1_char(s[0]);
}
unicode = PyUnicode_New(size, 127);
unicode = ascii_new(size);
if (!unicode)
return NULL;
memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
Expand Down Expand Up @@ -4978,12 +5020,17 @@
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#if (SIZEOF_SIZE_T == 8)
/* Mask to quickly check whether a C 'size_t' contains a
non-ASCII, UTF8-encoded char. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
// used to count codepoints in UTF-8 string.
# define VECTOR_0101 0x0101010101010101ULL
# define VECTOR_00FF 0x00ff00ff00ff00ffULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101 0x01010101U
# define VECTOR_00FF 0x00ff00ffU
#else
# error C 'size_t' size should be either 4 or 8!
#endif
Expand Down Expand Up @@ -5043,6 +5090,145 @@
return p - start;
}

#if (defined(__clang__) || defined(__GNUC__))
#define HAS_CTZ 1
// Count trailing zero bits of v.  v must be non-zero: the underlying
// builtins/intrinsics have undefined results for 0.
static inline unsigned int ctz(size_t v) {
    return __builtin_ctzll((unsigned long long)v);
}
#elif defined(_MSC_VER)
#define HAS_CTZ 1
static inline unsigned int ctz(size_t v) {
    unsigned long pos;
// Pick the intrinsic matching the width of size_t on this platform.
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&pos, v);
#else
    _BitScanForward64(&pos, v);
#endif /* SIZEOF_SIZE_T */
    return pos;
}
#endif

// Return the offset of the first byte with the high bit set (the first
// non-ASCII byte) in [start, end), or end - start if all bytes are ASCII.
static Py_ssize_t
find_first_nonascii(const unsigned char *start, const unsigned char *end)
{
    const unsigned char *p = start;

    if (end - start > SIZEOF_SIZE_T + ALIGNOF_SIZE_T) {
        // Advance byte by byte until p is aligned for word-sized loads.
        while (!_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
            if ((unsigned char)*p & 0x80) {
                return p - start;
            }
            p++;
        }
        // Scan one size_t word at a time.  ASCII_CHAR_MASK keeps only the
        // top bit of every byte in the word; a non-zero result means at
        // least one non-ASCII byte is present in this word.
        const unsigned char *e = end - SIZEOF_SIZE_T;
        while (p <= e) {
            size_t value = (*(const size_t *)p) & ASCII_CHAR_MASK;
            if (value) {
#if PY_LITTLE_ENDIAN && HAS_CTZ
                // On little endian the lowest set bit belongs to the first
                // non-ASCII byte.  Each byte contributes only its top bit
                // (bit 7 of the byte), so (ctz(value) - 7) / 8 is that
                // byte's offset within the word.
                return p - start + (ctz(value) - 7) / 8;
#else
                // Big endian and minor compilers are difficult to test;
                // fall back to the per-byte check below.
                break;
#endif
            }
            p += SIZEOF_SIZE_T;
        }
    }
#if HAS_CTZ
    // This part looks a bit tricky, but decoding short ASCII strings is
    // performance critical.  The tail bytes are packed into a size_t
    // manually (low byte first), so the ctz trick works regardless of the
    // host byte order.  The switch reads only the bytes that actually
    // exist before `end`; intentional fallthrough fills in higher bytes.
    while (p < end) {
        size_t u = (size_t)(p[0]);
        switch (end - p) {
        default:
#if SIZEOF_SIZE_T == 8
            u |= (size_t)(p[7]) << 56ull;
        // fall through
        case 7:
            u |= (size_t)(p[6]) << 48ull;
        // fall through
        case 6:
            u |= (size_t)(p[5]) << 40ull;
        // fall through
        case 5:
            u |= (size_t)(p[4]) << 32ull;
        // fall through
        case 4:
#endif
            u |= (size_t)(p[3]) << 24;
        // fall through
        case 3:
            u |= (size_t)(p[2]) << 16;
        // fall through
        case 2:
            u |= (size_t)(p[1]) << 8;
            break;
        case 1:
            break;
        }
        if (u & ASCII_CHAR_MASK) {
            return p - start + (ctz(u & ASCII_CHAR_MASK) - 7) / 8;
        }
        p += SIZEOF_SIZE_T;
    }
    return end - start;
#else
    // Portable fallback: plain per-byte scan.
    while (p < end) {
        if ((unsigned char)*p & 0x80) {
            break;
        }
        p++;
    }
    return p - start;
#endif
}

// Return 1 if `ch` begins a UTF-8 sequence, 0 if it is a continuation
// byte.  Start bytes are 0xxxxxxx (ASCII) or 11xxxxxx (lead bytes);
// continuation bytes are 10xxxxxx.
static inline int scalar_utf8_start_char(unsigned int ch)
{
    unsigned int is_ascii = (~ch >> 7) & 1;   // top bit clear
    unsigned int is_lead = (ch >> 6) & 1;     // bits 7..6 == 11
    return (int)(is_ascii | is_lead);
}

// SWAR version of scalar_utf8_start_char(): for every byte of `v`,
// produce 0x01 if that byte starts a UTF-8 sequence and 0x00 if it is a
// continuation byte (10xxxxxx).
static inline size_t vector_utf8_start_chars(size_t v)
{
    size_t top_bit_clear = ~v >> 7;       // per byte: bit0 set if 0xxxxxxx
    size_t two_top_bits = v >> 6;         // per byte: bit0 set if x1xxxxxx
    return (top_bit_clear | two_top_bits) & VECTOR_0101;
}

// Count the code points in the UTF-8 byte sequence [s, end): every byte
// except continuation bytes (10xxxxxx) starts a code point.  Assumes the
// input is (mostly) well-formed UTF-8 — TODO(review) confirm callers only
// pass validated prefixes.
static Py_ssize_t utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
{
    Py_ssize_t len = 0;

    if (end - s > SIZEOF_SIZE_T + ALIGNOF_SIZE_T) {
        // Align s for word-sized loads.
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
            len += scalar_utf8_start_char(*s++);
        }

        while (s + SIZEOF_SIZE_T <= end) {
            // Accumulate at most 255 words per round so each one-byte lane
            // of the SWAR counter (incremented by at most 1 per word) can
            // never overflow 255.
            const unsigned char *e = end;
            if (e - s > SIZEOF_SIZE_T * 255) {
                e = s + SIZEOF_SIZE_T * 255;
            }
            Py_ssize_t vstart = 0;
            while (s + SIZEOF_SIZE_T <= e) {
                size_t v = *(size_t*)s;
                size_t vs = vector_utf8_start_chars(v);
                vstart += vs;
                s += SIZEOF_SIZE_T;
            }
            // Horizontal sum of the per-byte lanes: widen to 16-bit pairs,
            // then fold halves together.  The total fits in 11 bits
            // (<= SIZEOF_SIZE_T * 255 = 2040), hence the 0x7ff mask.
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
            vstart += vstart >> 16;
#if SIZEOF_SIZE_T == 8
            vstart += vstart >> 32;
#endif
            len += vstart & 0x7ff;
        }
    }
    // Tail: per-byte count for any remaining bytes.
    while (s < end) {
        len += scalar_utf8_start_char(*s++);
    }
    return len;
}

static int
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
Expand Down Expand Up @@ -5187,27 +5373,66 @@
return get_latin1_char((unsigned char)s[0]);
}

// fast path: try ASCII string.
const char *starts = s;
const char *end = s + size;
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
// It is unclear whether this check is strictly necessary, but there is a
// test case that requires size=PY_SSIZE_T_MAX to raise MemoryError.
if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
PyErr_NoMemory();
return NULL;
}
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
if (decoded == size) {

const char *starts = s;
const char *end = s + size;

Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
if (pos == size) { // fast path: ASCII string.
PyObject *u = ascii_new(size);
if (u == NULL) {
return NULL;
}
// memcpy(PyUnicode_1BYTE_DATA(u), s, size);
// bypass iscompact & isascii checks.
memcpy(_Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(u) + 1)), s, size);
if (consumed) {
*consumed = size;
}
return u;
}
s += decoded;
size -= decoded;

int maxchr = 127;
Py_ssize_t maxsize = size;

unsigned char ch = (unsigned char)(s[pos]);
// error handler other than strict may remove/replace the invalid byte.
// consumed != NULL allows 1-3 trailing bytes to remain undecoded.
// 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
// otherwise: check the input and decide the maxchr and maxsize to reduce
// reallocation and copy.
if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
if (ch < 0xc4) { // latin1
maxchr = 255;
}
else if (ch < 0xf0) { // ucs2
maxchr = 65535;
}
else { // ucs4
maxchr = 0x10ffff;
}
}
PyObject *u = PyUnicode_New(maxsize, maxchr);
if (!u) {
return NULL;
}

// Use _PyUnicodeWriter after fast path is failed.
_PyUnicodeWriter writer;
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = decoded;
if (maxchr <= 255) {
memcpy(_PyUnicode_COMPACT_DATA(u), s, pos);
s += pos;
size -= pos;
writer.pos = pos;
}

if (unicode_decode_utf8_impl(&writer, starts, s, end,
error_handler, errors,
Expand Down Expand Up @@ -5267,7 +5492,9 @@
const char *errors,
Py_ssize_t *consumed)
{
return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
return unicode_decode_utf8(s, size,
errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
errors, consumed);
}


Expand Down Expand Up @@ -7282,7 +7509,7 @@
}

// Shortcut for simple case
PyObject *u = PyUnicode_New(size, 127);
PyObject *u = ascii_new(size);
if (u == NULL) {
return NULL;
}
Expand Down
Loading