diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 440410d0aef17d..9e53fab842909a 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -257,16 +257,14 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end, /* UTF-8 encoder specialized for a Unicode kind to avoid the slow PyUnicode_READ() macro. Delete some parts of the code depending on the kind: UCS-1 strings don't need to handle surrogates for example. */ -Py_LOCAL_INLINE(char *) -STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, - PyObject *unicode, +Py_LOCAL_INLINE(PyBytesWriter*) +STRINGLIB(utf8_encoder)(PyObject *unicode, const STRINGLIB_CHAR *data, Py_ssize_t size, _Py_error_handler error_handler, - const char *errors) + const char *errors, + char **end) { - Py_ssize_t i; /* index into data of next input character */ - char *p; /* next free byte in output buffer */ #if STRINGLIB_SIZEOF_CHAR > 1 PyObject *error_handler_obj = NULL; PyObject *exc = NULL; @@ -284,14 +282,19 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, if (size > PY_SSIZE_T_MAX / max_char_size) { /* integer overflow */ PyErr_NoMemory(); + *end = NULL; return NULL; } - _PyBytesWriter_Init(writer); - p = _PyBytesWriter_Alloc(writer, size * max_char_size); - if (p == NULL) + PyBytesWriter *writer = PyBytesWriter_Create(size * max_char_size); + if (writer == NULL) { + *end = NULL; return NULL; + } + /* next free byte in output buffer */ + char *p = PyBytesWriter_GetData(writer); + Py_ssize_t i; /* index into data of next input character */ for (i = 0; i < size;) { Py_UCS4 ch = data[i++]; @@ -348,7 +351,7 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, case _Py_ERROR_BACKSLASHREPLACE: /* subtract preallocated bytes */ - writer->min_size -= max_char_size * (endpos - startpos); + writer->size -= max_char_size * (endpos - startpos); p = backslashreplace(writer, p, unicode, startpos, endpos); if (p == NULL) @@ -358,7 +361,7 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, case _Py_ERROR_XMLCHARREFREPLACE: /* subtract preallocated bytes */ - writer->min_size -= max_char_size * (endpos - startpos); + writer->size -= max_char_size * (endpos - startpos); p = xmlcharrefreplace(writer, p, unicode, startpos, endpos); if (p == NULL) @@ -389,22 +392,25 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, if (newpos < startpos) { writer->overallocate = 1; - p = _PyBytesWriter_Prepare(writer, p, - max_char_size * (startpos - newpos)); - if (p == NULL) + p = PyBytesWriter_GrowAndUpdatePointer(writer, + max_char_size * (startpos - newpos), + p); + if (p == NULL) { goto error; + } } else { /* subtract preallocated bytes */ - writer->min_size -= max_char_size * (newpos - startpos); + writer->size -= max_char_size * (newpos - startpos); /* Only overallocate the buffer if it's not the last write */ writer->overallocate = (newpos < size); } + char *rep_str; + Py_ssize_t rep_len; if (PyBytes_Check(rep)) { - p = _PyBytesWriter_WriteBytes(writer, p, - PyBytes_AS_STRING(rep), - PyBytes_GET_SIZE(rep)); + rep_str = PyBytes_AS_STRING(rep); + rep_len = PyBytes_GET_SIZE(rep); } else { /* rep is unicode */ @@ -415,13 +421,16 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, goto error; } - p = _PyBytesWriter_WriteBytes(writer, p, - PyUnicode_DATA(rep), - PyUnicode_GET_LENGTH(rep)); + rep_str = PyUnicode_DATA(rep); + rep_len = PyUnicode_GET_LENGTH(rep); } - if (p == NULL) + p = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, p); + if (p == NULL) { goto error; + } + memcpy(p, rep_str, rep_len); + p += rep_len; Py_CLEAR(rep); i = newpos; @@ -458,13 +467,16 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, Py_XDECREF(error_handler_obj); Py_XDECREF(exc); #endif - return p; + *end = p; + return writer; #if STRINGLIB_SIZEOF_CHAR > 1 error: + PyBytesWriter_Discard(writer); Py_XDECREF(rep); Py_XDECREF(error_handler_obj); Py_XDECREF(exc); + *end = NULL; return NULL; #endif } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c8d2c68615e13e..ab0f00e6be0b38 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -828,7 +828,7 @@ unicode_result_unchanged(PyObject *unicode) /* Implementation of the "backslashreplace" error handler for 8-bit encodings: ASCII, Latin1, UTF-8, etc. */ static char* -backslashreplace(_PyBytesWriter *writer, char *str, +backslashreplace(PyBytesWriter *writer, char *str, PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) { Py_ssize_t size, i; @@ -861,9 +861,10 @@ backslashreplace(_PyBytesWriter *writer, char *str, size += incr; } - str = _PyBytesWriter_Prepare(writer, str, size); - if (str == NULL) + str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str); + if (str == NULL) { return NULL; + } /* generate replacement */ for (i = collstart; i < collend; ++i) { @@ -894,7 +895,7 @@ backslashreplace(_PyBytesWriter *writer, char *str, /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings: ASCII, Latin1, UTF-8, etc. */ static char* -xmlcharrefreplace(_PyBytesWriter *writer, char *str, +xmlcharrefreplace(PyBytesWriter *writer, char *str, PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) { Py_ssize_t size, i; @@ -935,9 +936,10 @@ xmlcharrefreplace(_PyBytesWriter *writer, char *str, size += incr; } - str = _PyBytesWriter_Prepare(writer, str, size); - if (str == NULL) + str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str); + if (str == NULL) { return NULL; + } /* generate replacement */ for (i = collstart; i < collend; ++i) { @@ -5836,7 +5838,7 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, const void *data = PyUnicode_DATA(unicode); Py_ssize_t size = PyUnicode_GET_LENGTH(unicode); - _PyBytesWriter writer; + PyBytesWriter *writer; char *end; switch (kind) { @@ -5845,21 +5847,24 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, case PyUnicode_1BYTE_KIND: /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ assert(!PyUnicode_IS_ASCII(unicode)); - end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); + writer = ucs1lib_utf8_encoder(unicode, data, size, + error_handler, errors, &end); break; case PyUnicode_2BYTE_KIND: - end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); + writer = ucs2lib_utf8_encoder(unicode, data, size, + error_handler, errors, &end); break; case PyUnicode_4BYTE_KIND: - end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); + writer = ucs4lib_utf8_encoder(unicode, data, size, + error_handler, errors, &end); break; } - if (end == NULL) { - _PyBytesWriter_Dealloc(&writer); + if (writer == NULL) { + PyBytesWriter_Discard(writer); return NULL; } - return _PyBytesWriter_Finish(&writer, end); + return PyBytesWriter_FinishWithPointer(writer, end); } static int @@ -5873,37 +5878,35 @@ unicode_fill_utf8(PyObject *unicode) const void *data = PyUnicode_DATA(unicode); Py_ssize_t size = PyUnicode_GET_LENGTH(unicode); - _PyBytesWriter writer; + PyBytesWriter *writer; char *end; switch (kind) { default: Py_UNREACHABLE(); case PyUnicode_1BYTE_KIND: - end = ucs1lib_utf8_encoder(&writer, unicode, data, size, - _Py_ERROR_STRICT, NULL); + writer = ucs1lib_utf8_encoder(unicode, data, size, + _Py_ERROR_STRICT, NULL, &end); break; case PyUnicode_2BYTE_KIND: - end = ucs2lib_utf8_encoder(&writer, unicode, data, size, - _Py_ERROR_STRICT, NULL); + writer = ucs2lib_utf8_encoder(unicode, data, size, + _Py_ERROR_STRICT, NULL, &end); break; case PyUnicode_4BYTE_KIND: - end = ucs4lib_utf8_encoder(&writer, unicode, data, size, - _Py_ERROR_STRICT, NULL); + writer = ucs4lib_utf8_encoder(unicode, data, size, + _Py_ERROR_STRICT, NULL, &end); break; } - if (end == NULL) { - _PyBytesWriter_Dealloc(&writer); + if (writer == NULL) { return -1; } - const char *start = writer.use_small_buffer ? writer.small_buffer : - PyBytes_AS_STRING(writer.buffer); + const char *start = PyBytesWriter_GetData(writer); Py_ssize_t len = end - start; char *cache = PyMem_Malloc(len + 1); if (cache == NULL) { - _PyBytesWriter_Dealloc(&writer); + PyBytesWriter_Discard(writer); PyErr_NoMemory(); return -1; } @@ -5911,7 +5914,7 @@ unicode_fill_utf8(PyObject *unicode) cache[len] = '\0'; PyUnicode_SET_UTF8_LENGTH(unicode, len); PyUnicode_SET_UTF8(unicode, cache); - _PyBytesWriter_Dealloc(&writer); + PyBytesWriter_Discard(writer); return 0; } @@ -7347,16 +7350,12 @@ unicode_encode_ucs1(PyObject *unicode, Py_ssize_t pos=0, size; int kind; const void *data; - /* pointer into the output */ - char *str; const char *encoding = (limit == 256) ? "latin-1" : "ascii"; const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; PyObject *error_handler_obj = NULL; PyObject *exc = NULL; _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; PyObject *rep = NULL; - /* output object */ - _PyBytesWriter writer; size = PyUnicode_GET_LENGTH(unicode); kind = PyUnicode_KIND(unicode); @@ -7366,10 +7365,13 @@ unicode_encode_ucs1(PyObject *unicode, if (size == 0) return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES); - _PyBytesWriter_Init(&writer); - str = _PyBytesWriter_Alloc(&writer, size); - if (str == NULL) + /* output object */ + PyBytesWriter *writer = PyBytesWriter_Create(size); + if (writer == NULL) { return NULL; + } + /* pointer into the output */ + char *str = PyBytesWriter_GetData(writer); while (pos < size) { Py_UCS4 ch = PyUnicode_READ(kind, data, pos); @@ -7391,7 +7393,7 @@ unicode_encode_ucs1(PyObject *unicode, ++collend; /* Only overallocate the buffer if it's not the last write */ - writer.overallocate = (collend < size); + writer->overallocate = (collend < size); /* cache callback name lookup (if not done yet, i.e. it's the first error) */ if (error_handler == _Py_ERROR_UNKNOWN) @@ -7412,8 +7414,8 @@ unicode_encode_ucs1(PyObject *unicode, case _Py_ERROR_BACKSLASHREPLACE: /* subtract preallocated bytes */ - writer.min_size -= (collend - collstart); - str = backslashreplace(&writer, str, + writer->size -= (collend - collstart); + str = backslashreplace(writer, str, unicode, collstart, collend); if (str == NULL) goto onError; @@ -7422,8 +7424,8 @@ unicode_encode_ucs1(PyObject *unicode, case _Py_ERROR_XMLCHARREFREPLACE: /* subtract preallocated bytes */ - writer.min_size -= (collend - collstart); - str = xmlcharrefreplace(&writer, str, + writer->size -= (collend - collstart); + str = xmlcharrefreplace(writer, str, unicode, collstart, collend); if (str == NULL) goto onError; @@ -7454,24 +7456,27 @@ unicode_encode_ucs1(PyObject *unicode, goto onError; if (newpos < collstart) { - writer.overallocate = 1; - str = _PyBytesWriter_Prepare(&writer, str, - collstart - newpos); - if (str == NULL) + writer->overallocate = 1; + str = PyBytesWriter_GrowAndUpdatePointer(writer, + collstart - newpos, + str); + if (str == NULL) { goto onError; + } } else { /* subtract preallocated bytes */ - writer.min_size -= newpos - collstart; + writer->size -= newpos - collstart; /* Only overallocate the buffer if it's not the last write */ - writer.overallocate = (newpos < size); + writer->overallocate = (newpos < size); } + char *rep_str; + Py_ssize_t rep_len; if (PyBytes_Check(rep)) { /* Directly copy bytes result to output. */ - str = _PyBytesWriter_WriteBytes(&writer, str, - PyBytes_AS_STRING(rep), - PyBytes_GET_SIZE(rep)); + rep_str = PyBytes_AS_STRING(rep); + rep_len = PyBytes_GET_SIZE(rep); } else { assert(PyUnicode_Check(rep)); @@ -7486,12 +7491,16 @@ unicode_encode_ucs1(PyObject *unicode, goto onError; } assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); - str = _PyBytesWriter_WriteBytes(&writer, str, - PyUnicode_DATA(rep), - PyUnicode_GET_LENGTH(rep)); + rep_str = PyUnicode_DATA(rep); + rep_len = PyUnicode_GET_LENGTH(rep); } - if (str == NULL) + + str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str); + if (str == NULL) { goto onError; + } + memcpy(str, rep_str, rep_len); + str += rep_len; pos = newpos; Py_CLEAR(rep); @@ -7499,17 +7508,17 @@ unicode_encode_ucs1(PyObject *unicode, /* If overallocation was disabled, ensure that it was the last write. Otherwise, we missed an optimization */ - assert(writer.overallocate || pos == size); + assert(writer->overallocate || pos == size); } } Py_XDECREF(error_handler_obj); Py_XDECREF(exc); - return _PyBytesWriter_Finish(&writer, str); + return PyBytesWriter_FinishWithPointer(writer, str); onError: Py_XDECREF(rep); - _PyBytesWriter_Dealloc(&writer); + PyBytesWriter_Discard(writer); Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return NULL;