diff --git a/.gitignore b/.gitignore index 0081b62ae7f..64b03ed8e88 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,4 @@ sources var/* venv virtualenv.py +__pycache__ diff --git a/CHANGES/11320.feature.rst b/CHANGES/11320.feature.rst new file mode 100644 index 00000000000..193c86aa14d --- /dev/null +++ b/CHANGES/11320.feature.rst @@ -0,0 +1 @@ +Introduced Multidict C-API to cython http parser and http writer -- by :user:`Vizonex`. diff --git a/aiohttp/_http_parser.pyx b/aiohttp/_http_parser.pyx index f5015b297b0..1bdc496ee60 100644 --- a/aiohttp/_http_parser.pyx +++ b/aiohttp/_http_parser.pyx @@ -3,19 +3,38 @@ # Based on https://github.com/MagicStack/httptools # +# NOTE: I have scattered notes around this file +# Temporarily as I hunt for things to improve, Please know +# that my notes are all temporary and I plan to remove them +# when I convert the pull request from a draft to a real pull +# request. - Vizonex + from cpython cimport ( Py_buffer, PyBUF_SIMPLE, PyBuffer_Release, - PyBytes_AsString, PyBytes_AsStringAndSize, PyObject_GetBuffer, ) from cpython.mem cimport PyMem_Free, PyMem_Malloc +from cpython.object cimport PyObject from libc.limits cimport ULLONG_MAX from libc.string cimport memcpy +from multidict cimport ( + CIMultiDict, + CIMultiDict_Add, + CIMultiDict_Clear, + CIMultiDict_New, + CIMultiDict_UpdateFromDict, + CIMultiDict_UpdateFromMultiDict, + CIMultiDictProxy, + CIMultiDictProxy_Contains, + CIMultiDictProxy_GetOne, + CIMultiDictProxy_New, + istr, + multidict_import, +) -from multidict import CIMultiDict as _CIMultiDict, CIMultiDictProxy as _CIMultiDictProxy from yarl import URL as _URL from aiohttp import hdrs @@ -48,10 +67,13 @@ include "_headers.pxi" from aiohttp cimport _find_header +multidict_import() + ALLOWED_UPGRADES = frozenset({"websocket"}) DEF DEFAULT_FREELIST_SIZE = 250 cdef extern from "Python.h": + bytearray PyByteArray_FromStringAndSize(const char *string, Py_ssize_t len) int PyByteArray_Resize(object, Py_ssize_t) except -1 
Py_ssize_t PyByteArray_Size(object) except -1 char* PyByteArray_AsString(object) @@ -61,29 +83,27 @@ __all__ = ('HttpRequestParser', 'HttpResponseParser', cdef object URL = _URL cdef object URL_build = URL.build -cdef object CIMultiDict = _CIMultiDict -cdef object CIMultiDictProxy = _CIMultiDictProxy cdef object HttpVersion = _HttpVersion cdef object HttpVersion10 = _HttpVersion10 cdef object HttpVersion11 = _HttpVersion11 -cdef object SEC_WEBSOCKET_KEY1 = hdrs.SEC_WEBSOCKET_KEY1 -cdef object CONTENT_ENCODING = hdrs.CONTENT_ENCODING +cdef istr SEC_WEBSOCKET_KEY1 = hdrs.SEC_WEBSOCKET_KEY1 +cdef istr CONTENT_ENCODING = hdrs.CONTENT_ENCODING cdef object EMPTY_PAYLOAD = _EMPTY_PAYLOAD cdef object StreamReader = _StreamReader cdef object DeflateBuffer = _DeflateBuffer cdef bytes EMPTY_BYTES = b"" -cdef inline object extend(object buf, const char* at, size_t length): +cdef inline int extend(object buf, const char* at, size_t length) except -1: cdef Py_ssize_t s cdef char* ptr s = PyByteArray_Size(buf) - PyByteArray_Resize(buf, s + length) + if PyByteArray_Resize(buf, s + length) < 0: + return -1 ptr = PyByteArray_AsString(buf) memcpy(ptr + s, at, length) - + return 0 DEF METHODS_COUNT = 46; - cdef list _http_method = [] for i in range(METHODS_COUNT): @@ -108,12 +128,13 @@ cdef inline object find_header(bytes raw_header): return headers[idx] + @cython.freelist(DEFAULT_FREELIST_SIZE) cdef class RawRequestMessage: cdef readonly str method cdef readonly str path cdef readonly object version # HttpVersion - cdef readonly object headers # CIMultiDict + cdef readonly CIMultiDictProxy headers # CIMultiDictProxy[str] cdef readonly object raw_headers # tuple cdef readonly object should_close cdef readonly object compression @@ -135,7 +156,8 @@ cdef class RawRequestMessage: self.url = url def __repr__(self): - info = [] + # NOTE: This is Experimental, I might revert this later... 
+ cdef list info = [] info.append(("method", self.method)) info.append(("path", self.path)) info.append(("version", self.version)) @@ -186,7 +208,7 @@ cdef class RawRequestMessage: cdef _new_request_message(str method, str path, object version, - object headers, + CIMultiDictProxy headers, object raw_headers, bint should_close, object compression, @@ -208,12 +230,17 @@ cdef _new_request_message(str method, return ret +# TODO: headers can sometimes come in as different objects other than +# CIMultiDictProxy, this might be a problem if we wish to optimize these +# class datatypes further since some tests like to throw in a few +# curve balls into the headers argument. + @cython.freelist(DEFAULT_FREELIST_SIZE) cdef class RawResponseMessage: cdef readonly object version # HttpVersion cdef readonly int code cdef readonly str reason - cdef readonly object headers # CIMultiDict + cdef readonly CIMultiDictProxy headers # CIMultiDictProxy[str] cdef readonly object raw_headers # tuple cdef readonly object should_close cdef readonly object compression @@ -233,7 +260,7 @@ cdef class RawResponseMessage: self.chunked = chunked def __repr__(self): - info = [] + cdef list info = [] info.append(("version", self.version)) info.append(("code", self.code)) info.append(("reason", self.reason)) @@ -250,7 +277,7 @@ cdef class RawResponseMessage: cdef _new_response_message(object version, int code, str reason, - object headers, + CIMultiDictProxy headers, object raw_headers, bint should_close, object compression, @@ -297,7 +324,7 @@ cdef class HttpParser: bytearray _buf str _path str _reason - list _headers + CIMultiDict _headers list _raw_headers bint _upgraded list _messages @@ -345,7 +372,7 @@ cdef class HttpParser: self._loop = loop self._timer = timer - self._buf = bytearray() + self._buf = PyByteArray_FromStringAndSize(NULL, 0) self._payload = None self._payload_error = 0 self._payload_exception = payload_exception @@ -384,7 +411,7 @@ cdef class HttpParser: name = 
find_header(self._raw_name) value = self._raw_value.decode('utf-8', 'surrogateescape') - self._headers.append((name, value)) + CIMultiDict_Add(self._headers, name, value) if name is CONTENT_ENCODING: self._content_encoding = value @@ -398,6 +425,8 @@ cdef class HttpParser: if self._has_value: self._process_header() + # TODO: I would like to use the CAPI for Python bytes + # instead, python slices can be a bottleneck if self._raw_name is EMPTY_BYTES: self._raw_name = at[:length] else: @@ -411,6 +440,11 @@ cdef class HttpParser: self._has_value = True cdef _on_headers_complete(self): + cdef CIMultiDictProxy headers + cdef PyObject* upgrade_value + cdef unsigned char upgrade + cdef int chunked + self._process_header() should_close = not cparser.llhttp_should_keep_alive(self._cparser) @@ -418,18 +452,18 @@ cdef class HttpParser: chunked = self._cparser.flags & cparser.F_CHUNKED raw_headers = tuple(self._raw_headers) - headers = CIMultiDictProxy(CIMultiDict(self._headers)) + headers = CIMultiDictProxy_New(self._headers) if self._cparser.type == cparser.HTTP_REQUEST: - allowed = upgrade and headers.get("upgrade", "").lower() in ALLOWED_UPGRADES - if allowed or self._cparser.method == cparser.HTTP_CONNECT: - self._upgraded = True + if CIMultiDictProxy_GetOne(headers, "upgrade", &upgrade_value): + self._upgraded = (upgrade_value).lower() in ALLOWED_UPGRADES + self._upgraded = self._upgraded or self._cparser.method == cparser.HTTP_CONNECT else: if upgrade and self._cparser.status_code == 101: self._upgraded = True # do not support old websocket spec - if SEC_WEBSOCKET_KEY1 in headers: + if CIMultiDictProxy_Contains(headers, SEC_WEBSOCKET_KEY1): raise InvalidHeader(SEC_WEBSOCKET_KEY1) encoding = None @@ -569,7 +603,7 @@ cdef class HttpParser: return messages, False, b"" def set_upgraded(self, val): - self._upgraded = val + self._upgraded = val cdef class HttpRequestParser(HttpParser): @@ -596,7 +630,7 @@ cdef class HttpRequestParser(HttpParser): if self._cparser.method == 
cparser.HTTP_CONNECT: # authority-form, # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.3 - self._url = URL.build(authority=self._path, encoded=True) + self._url = URL_build(authority=self._path, encoded=True) elif idx3 > 1 and self._path[0] == '/': # origin-form, # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1 @@ -622,7 +656,7 @@ cdef class HttpRequestParser(HttpParser): query = self._path[idx1: idx2] fragment = self._path[idx2+1:] - self._url = URL.build( + self._url = URL_build( path=path, query_string=query, fragment=fragment, @@ -666,7 +700,8 @@ cdef int cb_on_message_begin(cparser.llhttp_t* parser) except -1: cdef HttpParser pyparser = parser.data pyparser._started = True - pyparser._headers = [] + # I would assume 5 is a good starting number let me know if it should be higher... + pyparser._headers = CIMultiDict_New(5) pyparser._raw_headers = [] PyByteArray_Resize(pyparser._buf, 0) pyparser._path = None diff --git a/aiohttp/_http_writer.pyx b/aiohttp/_http_writer.pyx index 4a3ae1f9e68..c1e802c0126 100644 --- a/aiohttp/_http_writer.pyx +++ b/aiohttp/_http_writer.pyx @@ -1,17 +1,43 @@ +# cython: freethreading_compatible = True +cimport cython from cpython.bytes cimport PyBytes_FromStringAndSize -from cpython.exc cimport PyErr_NoMemory +from cpython.exc cimport PyErr_NoMemory, PyErr_SetObject from cpython.mem cimport PyMem_Free, PyMem_Malloc, PyMem_Realloc -from cpython.object cimport PyObject_Str +from cpython.object cimport PyObject +from cpython.unicode cimport PyUnicode_Check, PyUnicode_CheckExact from libc.stdint cimport uint8_t, uint64_t from libc.string cimport memcpy +from multidict cimport ( + IStr_CheckExact, + MultiDictIter_New, + MultiDictIter_Next, + multidict_import, +) -from multidict import istr + +# Cython version should be a return type of str, +# Redoing the function signature should help eliminate +# a costly string check Otherwise A new function for the +# Multidict-CAPI should be looked into +cdef extern from 
"Python.h": + str PyObject_Str(object obj) + + +# NOTE: Cython API is Experimental and is held subject to change +# Depending on different circumstances. +# Remove this comment when draft is officially over +# or when 6.7 is released with the official names. +# This may or may not be what the other authors had in mind. +# My todos are held subject to removal when Draft is transformed +# into a real pull request. + + +# Run first thing so that Capsule imports... +multidict_import() DEF BUF_SIZE = 16 * 1024 # 16KiB cdef char BUFFER[BUF_SIZE] -cdef object _istr = istr - # ----------------- writer --------------------------- @@ -21,18 +47,18 @@ cdef struct Writer: Py_ssize_t pos -cdef inline void _init_writer(Writer* writer): +cdef inline void _init_writer(Writer* writer) noexcept: writer.buf = &BUFFER[0] writer.size = BUF_SIZE writer.pos = 0 -cdef inline void _release_writer(Writer* writer): +cdef inline void _release_writer(Writer* writer) noexcept: if writer.buf != BUFFER: PyMem_Free(writer.buf) -cdef inline int _write_byte(Writer* writer, uint8_t ch): +cdef inline int _write_byte(Writer* writer, uint8_t ch) except -1: cdef char * buf cdef Py_ssize_t size @@ -57,7 +83,7 @@ cdef inline int _write_byte(Writer* writer, uint8_t ch): return 0 -cdef inline int _write_utf8(Writer* writer, Py_UCS4 symbol): +cdef inline int _write_utf8(Writer* writer, Py_UCS4 symbol) except -1: cdef uint64_t utf = symbol if utf < 0x80: @@ -90,45 +116,56 @@ cdef inline int _write_utf8(Writer* writer, Py_UCS4 symbol): return _write_byte(writer, (0x80 | (utf & 0x3f))) -cdef inline int _write_str(Writer* writer, str s): +cdef inline int _write_str(Writer* writer, str s) except -1: cdef Py_UCS4 ch + if not PyUnicode_Check(s): + PyErr_SetObject(ValueError, "Invalid status-line: {!r}".format(s)) + return -1 for ch in s: if _write_utf8(writer, ch) < 0: return -1 + return 0 - -cdef inline int _write_str_raise_on_nlcr(Writer* writer, object s): +cdef inline int _write_str_raise_on_nlcr(Writer* writer, 
object s) except -1: cdef Py_UCS4 ch cdef str out_str - if type(s) is str: + + if PyUnicode_CheckExact(s): out_str = s - elif type(s) is _istr: + elif IStr_CheckExact(s): out_str = PyObject_Str(s) - elif not isinstance(s, str): - raise TypeError("Cannot serialize non-str key {!r}".format(s)) + elif not PyUnicode_Check(s): + PyErr_SetObject(TypeError, "Cannot serialize non-str key {!r}".format(s)) + return -1 else: out_str = str(s) for ch in out_str: if ch == 0x0D or ch == 0x0A: - raise ValueError( + PyErr_SetObject(ValueError, "Newline or carriage return detected in headers. " "Potential header injection attack." ) - if _write_utf8(writer, ch) < 0: return -1 + if _write_utf8(writer, ch) < 0: + return -1 + return 0 # --------------- _serialize_headers ---------------------- -def _serialize_headers(str status_line, headers): - cdef Writer writer - cdef object key - cdef object val + +def _serialize_headers(str status_line, object headers): + cdef Writer writer + cdef PyObject* key + cdef PyObject* val + cdef object multidict_iter _init_writer(&writer) try: + multidict_iter = MultiDictIter_New(headers) + if _write_str(&writer, status_line) < 0: raise if _write_byte(&writer, b'\r') < 0: @@ -136,14 +173,15 @@ def _serialize_headers(str status_line, headers): if _write_byte(&writer, b'\n') < 0: raise - for key, val in headers.items(): - if _write_str_raise_on_nlcr(&writer, key) < 0: + while MultiDictIter_Next(multidict_iter, &key, &val): + + if _write_str_raise_on_nlcr(&writer, key) < 0: raise if _write_byte(&writer, b':') < 0: raise if _write_byte(&writer, b' ') < 0: raise - if _write_str_raise_on_nlcr(&writer, val) < 0: + if _write_str_raise_on_nlcr(&writer, val) < 0: raise if _write_byte(&writer, b'\r') < 0: raise diff --git a/aiohttp/web_protocol.py b/aiohttp/web_protocol.py index cdeb423554f..3d5e17bf5b3 100644 --- a/aiohttp/web_protocol.py +++ b/aiohttp/web_protocol.py @@ -24,6 +24,7 @@ ) import yarl +from multidict import CIMultiDict, CIMultiDictProxy from 
propcache import under_cached_property from .abc import AbstractAccessLogger, AbstractAsyncAccessLogger, AbstractStreamWriter @@ -75,8 +76,8 @@ "UNKNOWN", "/", HttpVersion10, - {}, # type: ignore[arg-type] - {}, # type: ignore[arg-type] + CIMultiDictProxy(CIMultiDict()), + tuple(), True, None, False, @@ -615,7 +616,7 @@ async def start(self) -> None: payload, self, writer, - self._task_handler or asyncio.current_task(loop), # type: ignore[arg-type] + self._task_handler or asyncio.current_task(loop), ) try: # a new task is used for copy context vars (#3406) diff --git a/pyproject.toml b/pyproject.toml index a9b4200a06c..ea25255ad66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,6 +2,8 @@ requires = [ "pkgconfig", "setuptools >= 46.4.0", + # Temporarily redirect it to me, will revert when 6.7 gets released + "multidict @ git+https://github.com/Vizonex/multidict@capi" ] build-backend = "setuptools.build_meta" diff --git a/requirements/base.txt b/requirements/base.txt index 3faddb12da8..0faeec61887 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -24,7 +24,10 @@ gunicorn==23.0.0 # via -r requirements/base.in idna==3.6 # via yarl -multidict==6.6.3 +# Temporarily redirect it to me, will revert when 6.7 gets released +multidict @ git+https://github.com/Vizonex/multidict@capi + +# multidict==6.6.3 # via # -r requirements/runtime-deps.in # yarl diff --git a/requirements/constraints.txt b/requirements/constraints.txt index 92c62c691d6..fd2cd8668ce 100644 --- a/requirements/constraints.txt +++ b/requirements/constraints.txt @@ -113,7 +113,10 @@ markupsafe==3.0.2 # via jinja2 mdurl==0.1.2 # via markdown-it-py -multidict==6.6.3 +# Temporarily redirect it to me, will revert when 6.7 gets released +multidict @ git+https://github.com/Vizonex/multidict@capi + +# multidict==6.6.3 # via # -r requirements/multidict.in # -r requirements/runtime-deps.in diff --git a/requirements/cython.txt b/requirements/cython.txt index 39257b77599..c144bb248c0 100644 --- 
a/requirements/cython.txt +++ b/requirements/cython.txt @@ -6,7 +6,11 @@ # cython==3.1.2 # via -r requirements/cython.in -multidict==6.6.3 + +# Temporarily redirect it to me, will revert when 6.7 gets released +multidict @ git+https://github.com/Vizonex/multidict@capi + +# multidict==6.6.3 # via -r /home/dependabot/dependabot-updater/tmp/20250715-1382-lngh7e/dependabot_20250715-1382-a7k872/requirements/multidict.in typing-extensions==4.14.1 # via multidict diff --git a/requirements/dev.txt b/requirements/dev.txt index af047b92626..ac5b9235af1 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -111,7 +111,10 @@ markupsafe==3.0.2 # via jinja2 mdurl==0.1.2 # via markdown-it-py -multidict==6.6.3 +# Temporarily redirect it to me, will revert when 6.7 gets released +multidict @ git+https://github.com/Vizonex/multidict@capi + +# multidict==6.6.3 # via # -r requirements/runtime-deps.in # yarl diff --git a/requirements/multidict.txt b/requirements/multidict.txt index 6f90d5c4c34..f186799b958 100644 --- a/requirements/multidict.txt +++ b/requirements/multidict.txt @@ -4,7 +4,8 @@ # # pip-compile --allow-unsafe --output-file=requirements/multidict.txt --resolver=backtracking --strip-extras requirements/multidict.in # -multidict==6.6.3 +# Temporarily redirect it to me, will revert when 6.7 gets released +multidict @ git+https://github.com/Vizonex/multidict@capi # via -r requirements/multidict.in typing-extensions==4.14.1 # via multidict diff --git a/requirements/runtime-deps.txt b/requirements/runtime-deps.txt index 327fae9055b..a5c762d8220 100644 --- a/requirements/runtime-deps.txt +++ b/requirements/runtime-deps.txt @@ -22,7 +22,10 @@ frozenlist==1.7.0 # aiosignal idna==3.6 # via yarl -multidict==6.6.3 + +# Temporarily redirect it to me, will revert when 6.7 gets released +multidict @ git+https://github.com/Vizonex/multidict@capi + # via # -r requirements/runtime-deps.in # yarl diff --git a/requirements/test.txt b/requirements/test.txt index 
7bfa1c7a195..7e601a4c740 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -57,7 +57,8 @@ markdown-it-py==3.0.0 # via rich mdurl==0.1.2 # via markdown-it-py -multidict==6.6.3 +# Temporarily redirect it to me, I will revert when 6.7 gets released +multidict @ git+https://github.com/Vizonex/multidict@capi # via # -r requirements/runtime-deps.in # yarl diff --git a/setup.py b/setup.py index fded89876f2..c401327e5a8 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,7 @@ import pathlib import sys +import multidict from setuptools import Extension, setup if sys.version_info < (3, 9): @@ -51,7 +52,7 @@ ] llhttp_kwargs = { "define_macros": [("LLHTTP_STRICT_MODE", 0)], - "include_dirs": ["vendor/llhttp/build"], + "include_dirs": ["vendor/llhttp/build"] + multidict.__path__, } extensions = [ @@ -65,7 +66,11 @@ ], **llhttp_kwargs, ), - Extension("aiohttp._http_writer", ["aiohttp/_http_writer.c"]), + Extension( + "aiohttp._http_writer", + ["aiohttp/_http_writer.c"], + include_dirs=multidict.__path__, + ), Extension("aiohttp._websocket.reader_c", ["aiohttp/_websocket/reader_c.c"]), ] diff --git a/tests/__pycache__/test_payload.cpython-310-pytest-8.4.1.pyc.25876 b/tests/__pycache__/test_payload.cpython-310-pytest-8.4.1.pyc.25876 new file mode 100644 index 00000000000..204ec3430d4 Binary files /dev/null and b/tests/__pycache__/test_payload.cpython-310-pytest-8.4.1.pyc.25876 differ diff --git a/tests/__pycache__/test_proxy.cpython-310-pytest-8.4.1.pyc.25940 b/tests/__pycache__/test_proxy.cpython-310-pytest-8.4.1.pyc.25940 new file mode 100644 index 00000000000..275dbe09b17 Binary files /dev/null and b/tests/__pycache__/test_proxy.cpython-310-pytest-8.4.1.pyc.25940 differ diff --git a/tests/__pycache__/test_web_sendfile.cpython-310-pytest-8.4.1.pyc.25496 b/tests/__pycache__/test_web_sendfile.cpython-310-pytest-8.4.1.pyc.25496 new file mode 100644 index 00000000000..3db1f71a5dd Binary files /dev/null and 
b/tests/__pycache__/test_web_sendfile.cpython-310-pytest-8.4.1.pyc.25496 differ