aio-libs · Vizonex · Jun 10, 2025 · Jun 10, 2025 · Jun 10, 2025 · Jun 10, 2025
diff --git a/CHANGES/1529.feature.rst b/CHANGES/1529.feature.rst
@@ -0,0 +1 @@
+Optimized Unquote class with the use of a new Unicode writer based off the quoting writer -- by :user:`Vizonex`.
diff --git a/yarl/_quoting_c.pyx b/yarl/_quoting_c.pyx
@@ -3,9 +3,11 @@
 from cpython.exc cimport PyErr_NoMemory
 from cpython.mem cimport PyMem_Free, PyMem_Malloc, PyMem_Realloc
 from cpython.unicode cimport (
+    PyUnicode_4BYTE_KIND,
     PyUnicode_DATA,
     PyUnicode_DecodeASCII,
     PyUnicode_DecodeUTF8Stateful,
+    PyUnicode_FromKindAndData,
     PyUnicode_GET_LENGTH,
     PyUnicode_KIND,
     PyUnicode_READ,
@@ -311,6 +313,73 @@ cdef class _Quoter:
         return _write_utf8(writer, ch)
 
 
+# Custom Writer for dealing with unicode characters so that lists aren't required when
+# Unquoting...
+# Python's C API can't do dynamic Unicode allocating so this was the closest solution...
+
+# ----------------- Unicode Writer ---------------------------
+
+
+cdef struct UnicodeWriter:
+    Py_UCS4 *buf
+    int kind
+    Py_ssize_t index
+    Py_ssize_t size
+
+cdef inline int _unicode_writer_init(
+    UnicodeWriter* writer, Py_ssize_t size, int kind
+) except -1:
+    writer.buf = <Py_UCS4*>PyMem_Malloc(sizeof(Py_UCS4) * size)
+    if writer.buf == NULL:
+        PyErr_NoMemory()
+        return -1
+    writer.kind = kind
+    writer.index = 0
+    writer.size = size
+    return 0
+
+cdef inline int _unicode_writer__write_char(
+    UnicodeWriter* writer, Py_UCS4 ch
+) except -1:
+    cdef Py_UCS4* alloc
+    cdef Py_ssize_t size
+    if writer.index >= writer.size:
+        size = writer.size + BUF_SIZE
+
+        alloc = <Py_UCS4*>PyMem_Realloc(writer.buf, size)
+        if alloc == NULL:
+            # Release writer's memory and then throw an error...
+            PyMem_Free(writer.buf)
+            PyErr_NoMemory()
+            return -1
+
+        writer.buf = alloc
+        writer.size = size
+
+    writer.buf[writer.index] = ch
+    writer.index += 1
+    return 0
+
+
+cdef inline int _unicode_writer__write_str(
+    UnicodeWriter* writer, str uni
+) except -1:
+    cdef Py_UCS4 ch
+    for ch in uni:
+        if _unicode_writer__write_char(writer, ch) < 0:
+            return -1
+    return 0
+
+cdef inline str _unicode_writer_finish(UnicodeWriter* writer):
+    return PyUnicode_FromKindAndData(writer.kind, writer.buf, writer.index)
+
+cdef inline void _unicode_writer_release(UnicodeWriter* writer):
+    if writer.buf != NULL:
+        PyMem_Free(writer.buf)
+
+# ----------------- End Unicode Writer ---------------------------
+
+
 cdef class _Unquoter:
     cdef str _ignore
     cdef bint _has_ignore
@@ -352,18 +421,23 @@ cdef class _Unquoter:
         if length == 0:
             return val
 
-        cdef list ret = []
+        cdef UnicodeWriter writer
+        cdef str ret
         cdef char buffer[4]
         cdef Py_ssize_t buflen = 0
         cdef Py_ssize_t consumed
         cdef str unquoted
+        cdef str h
         cdef Py_UCS4 ch = 0
         cdef long chl = 0
         cdef Py_ssize_t idx = 0
         cdef Py_ssize_t start_pct
         cdef int kind = PyUnicode_KIND(val)
         cdef const void *data = PyUnicode_DATA(val)
         cdef bint changed = 0
+
+        _unicode_writer_init(&writer, len(val), PyUnicode_4BYTE_KIND)
+
         while idx < length:
             ch = PyUnicode_READ(kind, data, idx)
             idx += 1
@@ -386,65 +460,68 @@ cdef class _Unquoter:
                         start_pct = idx - buflen * 3
                         buffer[0] = ch
                         buflen = 1
-                        ret.append(val[start_pct : idx - 3])
+                        _unicode_writer__write_str(&writer, val[start_pct : idx - 3])
                         try:
                             unquoted = PyUnicode_DecodeUTF8Stateful(buffer, buflen,
                                                                     NULL, &consumed)
                         except UnicodeDecodeError:
                             buflen = 0
-                            ret.append(val[idx - 3 : idx])
+                            _unicode_writer__write_str(&writer, val[idx - 3 : idx])
                             continue
                     if not unquoted:
                         assert consumed == 0
                         continue
                     assert consumed == buflen
                     buflen = 0
                     if self._qs and unquoted in '+=&;':
-                        ret.append(self._qs_quoter(unquoted))
+                        _unicode_writer__write_str(&writer, self._qs_quoter(unquoted))
                     elif (
                         (self._unsafe_bytes_len and unquoted in self._unsafe) or
                         (self._has_ignore and unquoted in self._ignore)
                     ):
-                        ret.append(self._quoter(unquoted))
+                        _unicode_writer__write_str(&writer, self._quoter(unquoted))
                     else:
-                        ret.append(unquoted)
+                        _unicode_writer__write_str(&writer, unquoted)
                     continue
                 else:
                     ch = '%'
 
             if buflen:
                 start_pct = idx - 1 - buflen * 3
-                ret.append(val[start_pct : idx - 1])
+                _unicode_writer__write_str(&writer, val[start_pct : idx - 1])
                 buflen = 0
 
             if ch == '+':
                 if (
                     (not self._qs and not self._plus) or
                     (self._unsafe_bytes_len and self._is_char_unsafe(ch))
                 ):
-                    ret.append('+')
+                    _unicode_writer__write_char(&writer, '+')
                 else:
                     changed = 1
-                    ret.append(' ')
+                    _unicode_writer__write_char(&writer, ' ')
                 continue
 
             if self._unsafe_bytes_len and self._is_char_unsafe(ch):
                 changed = 1
-                ret.append('%')
+                _unicode_writer__write_char(&writer, '%')
                 h = hex(ord(ch)).upper()[2:]
                 for ch in h:
-                    ret.append(ch)
+                    _unicode_writer__write_char(&writer, ch)
                 continue
 
-            ret.append(ch)
+            _unicode_writer__write_str(&writer, ch)
 
         if not changed:
+            _unicode_writer_release(&writer)
             return val
 
         if buflen:
-            ret.append(val[length - buflen * 3 : length])
+            _unicode_writer__write_str(&writer, val[length - buflen * 3 : length])
 
-        return ''.join(ret)
+        ret = _unicode_writer_finish(&writer)
+        _unicode_writer_release(&writer)
+        return ret
 
     cdef inline bint _is_char_unsafe(self, Py_UCS4 ch):
         for i in range(self._unsafe_bytes_len):
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Optimized Unquote class with the use of a new Unicode writer based off the quoting writer -- by :user:`Vizonex`.