diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index 5d9f94fe..c2e526db 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -24,8 +24,9 @@ bug report! * `Ade Oshineye `_ * `Tom Parker-Shemilt `_ * `Martin Pool `_ +* `Nestor Rodriguez `_ * `Sam Ruby `_ * `Bernd Schlapsi `_ * `Aaron Swartz `_ +* `Jonathan Vanasco `_ * `Jakub Wilk `_ -* `Nestor Rodriguez `_ diff --git a/feedparser/api.py b/feedparser/api.py index 8b95ed5c..f6e7c920 100644 --- a/feedparser/api.py +++ b/feedparser/api.py @@ -30,7 +30,7 @@ import urllib.error import urllib.parse import xml.sax -from typing import IO +from typing import IO, Any from . import http from .encodings import MissingEncoding, convert_file_to_utf8 @@ -72,9 +72,10 @@ def _open_resource( -    url_file_stream_or_string, -    result, -): +    url_file_stream_or_string: Any, +    result: dict, +    requests_hooks: http.RequestHooks | None = None, +) -> tuple[str, Any]: """URL, filename, or string --> stream This function lets you define parsers that take any input source @@ -83,7 +84,11 @@ def _open_resource( to have all the basic stdio read methods (read, readline, readlines). Just .close() the object when you're done with it. -    :return: A seekable, readable file object. +    :param requests_hooks: +        A dict of hooks to pass onto :func:`requests.get` if a URL is parsed. +        See `feedparser.http.RequestHooks` + +    :return: A tuple of (the method used, a seekable and readable file object). """ # Some notes on the history of the implementation of _open_resource(). 
@@ -104,8 +109,8 @@ def _open_resource( if callable(getattr(url_file_stream_or_string, "read", None)): if callable(getattr(url_file_stream_or_string, "seekable", None)): if url_file_stream_or_string.seekable(): - return url_file_stream_or_string - return _to_in_memory_file(url_file_stream_or_string.read()) + return "seekable", url_file_stream_or_string + return "read", _to_in_memory_file(url_file_stream_or_string.read()) looks_like_url = isinstance( url_file_stream_or_string, str @@ -114,12 +119,12 @@ def _open_resource( "https", ) if looks_like_url: - data = http.get(url_file_stream_or_string, result) - return io.BytesIO(data) + data = http.get(url_file_stream_or_string, result, hooks=requests_hooks) + return "url", io.BytesIO(data) # try to open with native open function (if url_file_stream_or_string is a filename) try: - return open(url_file_stream_or_string, "rb") + return "filepath", open(url_file_stream_or_string, "rb") except (OSError, TypeError, ValueError): # if url_file_stream_or_string is a str object that # cannot be converted to the encoding returned by @@ -131,7 +136,7 @@ def _open_resource( pass # treat url_file_stream_or_string as bytes/string - return _to_in_memory_file(url_file_stream_or_string) + return "raw_data", _to_in_memory_file(url_file_stream_or_string) def _to_in_memory_file(data): @@ -154,6 +159,8 @@ def parse( resolve_relative_uris: bool | None = None, sanitize_html: bool | None = None, optimistic_encoding_detection: bool | None = None, + archive_url_data: bool | None = None, + requests_hooks: http.RequestHooks | None = None, ) -> FeedParserDict: """Parse a feed from a URL, file, stream, or string. @@ -188,7 +195,12 @@ def parse( (uses less memory, but the wrong encoding may be detected in rare cases). Defaults to the value of :data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`, which is ``True``. - + :param archive_url_data: + Should feedparser archive the URL headers and content into + :attr:`FeedParserDict.raw` ? 
Defaults to ``False``. +    :param requests_hooks: +        A dict of hooks to pass onto :func:`requests.get` if a URL is parsed. +        See `feedparser.http.RequestHooks` """ result = FeedParserDict( @@ -196,13 +208,22 @@ def parse( bozo=False, entries=[], feed=FeedParserDict(), headers={}, +        raw={}, ) try: -        file = _open_resource( +        _method, file = _open_resource( url_file_stream_or_string, result, +            requests_hooks=requests_hooks, ) +        if _method == "url" and archive_url_data: +            # archive the headers before they are mutated by `response_headers` +            result.raw["headers"] = result["headers"].copy() +            # archive the content, then reset the file +            result.raw["content"] = file.read() +            file.seek(0) + except urllib.error.URLError as error: result.update( { diff --git a/feedparser/http.py b/feedparser/http.py index 7768dae5..e21357a4 100644 --- a/feedparser/http.py +++ b/feedparser/http.py @@ -30,6 +30,7 @@ import typing import requests +from typing_extensions import NotRequired # >=py311 from .datetimes import _parse_date @@ -45,19 +46,41 @@ ";q=0.1" ) +# This dict defines the allowable hooks. +# `response` is the only valid hook in `requests`. 
+# `response.postprocess` is used by feedparser itself: `get()` removes it before calling `requests` and invokes it after the response is received. +RequestHooks = typing.TypedDict( + "RequestHooks", + { + "response": typing.Union[typing.Callable, typing.Sequence[typing.Callable]], + "response.postprocess": NotRequired[typing.Callable], + }, +) + -def get(url: str, result: dict[str, typing.Any]) -> bytes: +def get( + url: str, + result: dict[str, typing.Any], + hooks: RequestHooks | None = None, +) -> bytes: + _postprocess: typing.Callable | None = None + if hooks is not None: + _postprocess = hooks.pop("response.postprocess", None) try: response = requests.get( url, headers={"Accept": ACCEPT_HEADER}, timeout=10, + hooks=hooks, ) except requests.RequestException as exception: result["bozo"] = True result["bozo_exception"] = exception return b"" + if _postprocess is not None: + _postprocess(response, result) + # Lowercase the HTTP header keys for comparisons per RFC 2616. result["headers"] = {k.lower(): v for k, v in response.headers.items()} diff --git a/pyproject.toml b/pyproject.toml index dbc32538..d4ae584e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ classifiers = [ dependencies = [ "sgmllib3k==1.0.0", "requests>=2.20.0", + "typing_extensions>=4.0.0", # NotRequired >=py311 ] [project.urls] diff --git a/tests/test_open_resource.py b/tests/test_open_resource.py index db575711..71c3510a 100644 --- a/tests/test_open_resource.py +++ b/tests/test_open_resource.py @@ -4,29 +4,39 @@ def test_fileobj(): - r = feedparser.api._open_resource(io.BytesIO(b""), {}).read() + method, filelike = feedparser.api._open_resource(io.BytesIO(b""), {}) + r = filelike.read() assert r == b"" + assert method == "seekable" def testbytes(): s = b"text" - r = feedparser.api._open_resource(s, {}).read() + method, filelike = feedparser.api._open_resource(s, {}) + r = filelike.read() assert s == r + assert method == "raw_data" def test_string(): s = b"text" - r = feedparser.api._open_resource(s, {}).read() + method, filelike = feedparser.api._open_resource(s, {}) + r = filelike.read() 
assert s == r + assert method == "raw_data" def test_unicode_1(): s = b"text" - r = feedparser.api._open_resource(s, {}).read() + method, filelike = feedparser.api._open_resource(s, {}) + r = filelike.read() assert s == r + assert method == "raw_data" def test_unicode_2(): s = rb"t\u00e9xt" - r = feedparser.api._open_resource(s, {}).read() + method, filelike = feedparser.api._open_resource(s, {}) + r = filelike.read() assert s == r + assert method == "raw_data"