diff --git a/.gitignore b/.gitignore
index 3fe67fd1..e967c85a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ dist
 docs/_build
 _trial_temp
 .coverage
-.cache
\ No newline at end of file
+.cache
+.mypy_cache/
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index dfeee83b..9b6642a9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,6 +18,10 @@ matrix:
       sudo: true
     - python: 3.5
       env: TOXENV=pypy3
+    - python: 3.6
+      env: TOXENV=mypy2
+    - python: 3.6
+      env: TOXENV=mypy3
 
 install:
     - |
diff --git a/setup.py b/setup.py
index 2ae088c9..44fdf0e2 100644
--- a/setup.py
+++ b/setup.py
@@ -29,5 +29,5 @@
         'Programming Language :: Python :: Implementation :: PyPy',
         'Topic :: Internet :: WWW/HTTP',
     ],
-    install_requires=['six >= 1.4.1'],
+    install_requires=['six >= 1.4.1', 'typing'],
 )
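The new `typing` entry in install_requires is load-bearing: the annotated modules below do `from typing import ...` at import time, so Python 2.7 and 3.4 installs need the PyPI backport even when mypy never runs. A minimal sketch of the comment-annotation style that keeps modules importable everywhere (the function is hypothetical, not from this diff):

    # Sketch, assuming only that `typing` is importable (stdlib on 3.5+,
    # the PyPI backport elsewhere). The import runs at runtime, so it
    # must succeed even if no type checker is ever invoked.
    from typing import Optional

    def pick(value, fallback):
        # type: (Optional[str], str) -> str
        # Comment-style annotations keep the file valid Python 2 syntax.
        return value if value is not None else fallback

    print(pick(None, 'default'))  # -> 'default'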
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 649c189a..5c2ff0f4 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -3,6 +3,7 @@
 from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
     http_content_type_encoding, resolve_encoding, html_to_unicode)
 
+
 class RequestEncodingTests(unittest.TestCase):
     utf8_fragments = [
         # Content-Type as meta http-equiv
@@ -51,6 +52,7 @@ def test_html_body_declared_encoding(self):
         for fragment in self.utf8_fragments:
             encoding = html_body_declared_encoding(fragment)
             self.assertEqual(encoding, 'utf-8', fragment)
+        self.assertEqual(None, html_body_declared_encoding(b"something else"))
         self.assertEqual(None, html_body_declared_encoding(b"""
@@ -76,6 +78,11 @@ def test_html_body_declared_encoding_unicode(self):
         self.assertEqual(None, html_body_declared_encoding(
             u""""""))
 
+    def test_html_body_declared_encoding_aliases(self):
+        fragment = b""""""
+        self.assertEqual("cp1251", html_body_declared_encoding(fragment))
+        self.assertEqual("cp1251", html_body_declared_encoding(fragment.decode('utf8')))
+
 
 class CodecsEncodingTestCase(unittest.TestCase):
     def test_resolve_encoding(self):
@@ -97,9 +104,11 @@ def test_invalid_utf8(self):
 
 def ct(charset):
     return "Content-Type: text/html; charset=" + charset if charset else None
 
+
 def norm_encoding(enc):
     return codecs.lookup(enc).name
 
+
 class HtmlConversionTests(unittest.TestCase):
 
     def test_unicode_body(self):
diff --git a/tox.ini b/tox.ini
index 0e5a0b39..a43b8b3a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py27, pypy, py34, py35, py36, py37, pypy3
+envlist = py27, pypy, py34, py35, py36, py37, pypy3, mypy2, mypy3
 
 [testenv]
 deps =
@@ -15,3 +15,20 @@ commands =
         --doctest-modules \
         --cov=w3lib --cov-report=term \
         {posargs:w3lib tests}
+
+
+[testenv:mypy2]
+basepython = python3.6
+deps =
+    mypy
+    typing
+commands =
+    mypy --py2 w3lib tests
+
+
+[testenv:mypy3]
+basepython = python3.6
+deps =
+    mypy
+commands =
+    mypy w3lib tests
diff --git a/w3lib/_types.py b/w3lib/_types.py
new file mode 100644
index 00000000..324a3e49
--- /dev/null
+++ b/w3lib/_types.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+"""
+Which string type to use?
+=========================
+
+1. Variable is an URL ==> use ``str``
+2. Variable is binary; unicode is not accepted ==> use ``bytes``
+3. Variable is text, and it can be only unicode in Python 2 ==> use
+   ``six.text_type`` (or typing.Text??)
+4. Variable is text, but it can be ascii or utf8-encoded str
+   in Python 2 ==> use w3lib._types.String
+5. Variable can be either bytes or unicode both in Python 2
+   and Python 3 ==> use typing.AnyStr
+6. Variable should be str (==bytes) in Python 2
+   and str (==unicode) in Python 3 ==> use ``str``.
+
+"""
+from __future__ import absolute_import
+from typing import Union
+import six
+
+if six.PY2:
+    String = Union[bytes, unicode]
+else:
+    String = str
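A hedged illustration of rule 4 above; the function is hypothetical, only the `String` alias comes from the new module. Under the `mypy --py2` tox environment the alias expands to `Union[bytes, unicode]`, under plain `mypy` it is just `str`:

    # Hypothetical example, not part of the diff.
    from w3lib._types import String

    def describe(label):
        # type: (String) -> str
        # On Python 2 both b'x' and u'x' pass the check;
        # on Python 3 only str does.
        return 'label of type %s' % type(label).__name__

    print(describe(u'x'))  # 'unicode' on Python 2, 'str' on Python 3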
+ +""" +from __future__ import absolute_import +from typing import Union +import six + +if six.PY2: + String = Union[bytes, unicode] +else: + String = str diff --git a/w3lib/encoding.py b/w3lib/encoding.py index c7ac567f..3d613fc1 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -2,12 +2,20 @@ """ Functions for handling encoding of web pages """ -import re, codecs, encodings +import re +import codecs +import encodings # type: ignore from sys import version_info +from typing import Optional, AnyStr, Tuple, Callable, Union, cast +import six + +from .util import to_native_str _HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I) + def http_content_type_encoding(content_type): + # type: (Optional[str]) -> Optional[str] """Extract the encoding in the content-type header >>> import w3lib.encoding @@ -20,6 +28,8 @@ def http_content_type_encoding(content_type): match = _HEADER_ENCODING_RE.search(content_type) if match: return resolve_encoding(match.group(1)) + return None + # regexp for parsing HTTP meta tags _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?''' @@ -40,13 +50,15 @@ def http_content_type_encoding(content_type): _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P .*?)(?P=quote)'), re.DOTALL | re.IGNORECASE) _cdata_re = re.compile(r'((?P .*?)(?P \]\]>))', re.DOTALL) @@ -38,7 +40,9 @@ def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'): return replace_entities(text, keep, remove_illegal, encoding) + def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'): + # type: (AnyStr, Sequence[String], bool, String) -> six.text_type u"""Remove entities from the given `text` by converting them to their corresponding unicode character. @@ -96,14 +100,19 @@ def convert_entity(m): return _ent_re.sub(convert_entity, to_unicode(text, encoding)) + def has_entities(text, encoding=None): + # type: (AnyStr, Optional[String]) -> bool return bool(_ent_re.search(to_unicode(text, encoding))) + def replace_tags(text, token='', encoding=None): + # type: (AnyStr, String, Optional[String]) -> six.text_type + """Replace all markup tags found in the given `text` by the given token. By default `token` is an empty string so it just removes all tags. - `text` can be a unicode string or a regular string encoded as `encoding` + `text` can be a unicode string or a byte string encoded as `encoding` (or ``'utf-8'`` if `encoding` is not given.) Always returns a unicode string. @@ -124,6 +133,7 @@ def replace_tags(text, token='', encoding=None): _REMOVECOMMENTS_RE = re.compile(u'', re.DOTALL) def remove_comments(text, encoding=None): + # type: (AnyStr, Optional[String]) -> six.text_type """ Remove HTML Comments. >>> import w3lib.html @@ -133,10 +143,13 @@ def remove_comments(text, encoding=None): """ - text = to_unicode(text, encoding) - return _REMOVECOMMENTS_RE.sub(u'', text) + text_unicode = to_unicode(text, encoding) + return _REMOVECOMMENTS_RE.sub(u'', text_unicode) + def remove_tags(text, which_ones=(), keep=(), encoding=None): + # type: (AnyStr, Sequence[String], Sequence[String], Optional[String]) -> six.text_type + """ Remove HTML Tags only. 
diff --git a/w3lib/html.py b/w3lib/html.py
--- a/w3lib/html.py
+++ b/w3lib/html.py
 _meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
 _cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
 
@@ -38,7 +40,9 @@ def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
     return replace_entities(text, keep, remove_illegal, encoding)
 
+
 def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
+    # type: (AnyStr, Sequence[String], bool, String) -> six.text_type
     u"""Remove entities from the given `text` by converting them to their
     corresponding unicode character.
 
@@ -96,14 +100,19 @@ def convert_entity(m):
 
     return _ent_re.sub(convert_entity, to_unicode(text, encoding))
 
+
 def has_entities(text, encoding=None):
+    # type: (AnyStr, Optional[String]) -> bool
     return bool(_ent_re.search(to_unicode(text, encoding)))
 
+
 def replace_tags(text, token='', encoding=None):
+    # type: (AnyStr, String, Optional[String]) -> six.text_type
+
     """Replace all markup tags found in the given `text` by the given token.
     By default `token` is an empty string so it just removes all tags.
 
-    `text` can be a unicode string or a regular string encoded as `encoding`
+    `text` can be a unicode string or a byte string encoded as `encoding`
     (or ``'utf-8'`` if `encoding` is not given.)
 
     Always returns a unicode string.
@@ -124,6 +133,7 @@
 _REMOVECOMMENTS_RE = re.compile(u'<!--.*?-->', re.DOTALL)
 
 def remove_comments(text, encoding=None):
+    # type: (AnyStr, Optional[String]) -> six.text_type
     """ Remove HTML Comments.
 
     >>> import w3lib.html
@@ -133,10 +143,13 @@ def remove_comments(text, encoding=None):
     >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
     u'test  whatever'
 
     """
-    text = to_unicode(text, encoding)
-    return _REMOVECOMMENTS_RE.sub(u'', text)
+    text_unicode = to_unicode(text, encoding)
+    return _REMOVECOMMENTS_RE.sub(u'', text_unicode)
+
 
 def remove_tags(text, which_ones=(), keep=(), encoding=None):
+    # type: (AnyStr, Sequence[String], Sequence[String], Optional[String]) -> six.text_type
+
     """ Remove HTML Tags only.
 
     `which_ones` and `keep` are both tuples, there are four cases:
@@ -185,26 +198,28 @@ def remove_tags(text, which_ones=(), keep=(), encoding=None):
     assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'
 
-    which_ones = {tag.lower() for tag in which_ones}
-    keep = {tag.lower() for tag in keep}
+    which_ones_ = {tag.lower() for tag in which_ones}
+    keep_ = {tag.lower() for tag in keep}
 
     def will_remove(tag):
         tag = tag.lower()
-        if which_ones:
-            return tag in which_ones
+        if which_ones_:
+            return tag in which_ones_
         else:
-            return tag not in keep
+            return tag not in keep_
 
     def remove_tag(m):
         tag = m.group(1)
         return u'' if will_remove(tag) else m.group(0)
 
-    regex = '</?([^ >/]+).*?>'
+    regex = u'</?([^ >/]+).*?>'
     retags = re.compile(regex, re.DOTALL | re.IGNORECASE)
 
     return retags.sub(remove_tag, to_unicode(text, encoding))
 
+
 def remove_tags_with_content(text, which_ones=(), encoding=None):
+    # type: (AnyStr, Sequence[String], Optional[String]) -> six.text_type
     """Remove tags and their content.
 
     `which_ones` is a tuple of which tags to remove including their content.
@@ -218,16 +233,18 @@ def remove_tags_with_content(text, which_ones=(), encoding=None):
 
     """
 
-    text = to_unicode(text, encoding)
+    text_unicode = to_unicode(text, encoding)
     if which_ones:
-        tags = '|'.join([r'<%s.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
+        tags = u'|'.join([r'<%s.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
         retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
-        text = retags.sub(u'', text)
-    return text
+        text_unicode = retags.sub(u'', text_unicode)
+    return text_unicode
+
 
-def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
-        encoding=None):
+def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by='',
+                         encoding=None):
+    # type: (AnyStr, Sequence[String], String, Optional[String]) -> six.text_type
+
     """Remove escape characters.
 
     `which_ones` is a tuple of which escape characters we want to remove.
@@ -238,12 +255,15 @@ def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
 
     """
 
-    text = to_unicode(text, encoding)
+    text_unicode = to_unicode(text, encoding)
     for ec in which_ones:
-        text = text.replace(ec, to_unicode(replace_by, encoding))
-    return text
+        text_unicode = text_unicode.replace(ec, to_unicode(replace_by, encoding))
+    return text_unicode
+
 
 def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
+    # type: (AnyStr, Sequence[String], bool, Optional[String]) -> six.text_type
+
     """
     This function receives markup as a text (always a unicode string or
     a UTF-8 encoded string) and does the following:
@@ -264,27 +284,29 @@ def _get_fragments(txt, pattern):
             offset = match_e
         yield txt[offset:]
 
-    text = to_unicode(text, encoding)
+    text_unicode = to_unicode(text, encoding)
     ret_text = u''
-    for fragment in _get_fragments(text, _cdata_re):
+    for fragment in _get_fragments(text_unicode, _cdata_re):
         if isinstance(fragment, six.string_types):
             # it's not a CDATA (so we try to remove its entities)
-            ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
+            # XXX: mypy has problems with six.string_types,
+            # had to ignore this type check
+            ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)  # type: ignore
         else:
             # it's a CDATA (so we just extract its content)
             ret_text += fragment.group('cdata_d')
     return ret_text
 
+
 def get_base_url(text, baseurl='', encoding='utf-8'):
+    # type: (AnyStr, str, String) -> str
     """Return the base url if declared in the given HTML `text`,
     relative to the given base url.
 
     If no base url is found, the given `baseurl` is returned.
-
     """
-
-    text = to_unicode(text, encoding)
-    m = _baseurl_re.search(text)
+    text_unicode = to_unicode(text, encoding)
+    m = _baseurl_re.search(text_unicode)
     if m:
         return moves.urllib.parse.urljoin(
             safe_url_string(baseurl),
@@ -293,30 +315,30 @@
     else:
         return safe_url_string(baseurl)
 
+
 def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
+    # type: (AnyStr, str, String, Sequence[String]) -> Union[Tuple[float, str], Tuple[None, None]]
     """Return the http-equiv parameter of the HTML meta element from the given
-    HTML text and return a tuple ``(interval, url)`` where interval is an integer
+    HTML text and return a tuple ``(interval, url)`` where interval is a number
     containing the delay in seconds (or zero if not present) and url is a
     string with the absolute url to redirect.
 
     If no meta redirect is found, ``(None, None)`` is returned.
 
     """
-
-    if six.PY2:
-        baseurl = to_bytes(baseurl, encoding)
+    baseurl_str = to_native_str(baseurl)
     try:
-        text = to_unicode(text, encoding)
+        text_unicode = to_unicode(text, encoding)
     except UnicodeDecodeError:
         print(text)
         raise
-    text = remove_tags_with_content(text, ignore_tags)
-    text = remove_comments(replace_entities(text))
-    m = _meta_refresh_re.search(text)
+    text_unicode = remove_tags_with_content(text_unicode, ignore_tags)
+    text_unicode = remove_comments(replace_entities(text_unicode))
+    m = _meta_refresh_re.search(text_unicode)
     if m:
         interval = float(m.group('int'))
         url = safe_url_string(m.group('url').strip(' "\''), encoding)
-        url = moves.urllib.parse.urljoin(baseurl, url)
+        url = moves.urllib.parse.urljoin(baseurl_str, url)
         return interval, url
     else:
         return None, None
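One pattern repeats through html.py: parameters are `AnyStr` (bytes or unicode in), returns are `six.text_type` (always unicode out). A short usage sketch; the sample markup is an assumption, not a fixture from this PR:

    # Usage sketch of the bytes-in/unicode-out contract.
    import w3lib.html

    html = b'<p>1 &lt; 2</p>'
    text = w3lib.html.remove_tags(html)        # bytes accepted here
    print(w3lib.html.replace_entities(text))   # unicode out: u'1 < 2'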
diff --git a/w3lib/http.py b/w3lib/http.py
index c7b94a23..848fac7a 100644
--- a/w3lib/http.py
+++ b/w3lib/http.py
@@ -1,7 +1,12 @@
+from __future__ import absolute_import
 from base64 import urlsafe_b64encode
+
+from typing import Dict, List, Optional, Union, Any
+
+from ._types import String
 
 
 def headers_raw_to_dict(headers_raw):
+    # type: (Optional[bytes]) -> Optional[Dict[bytes, List[bytes]]]
     r"""
     Convert raw headers (single multi-line bytestring)
     to a dictionary.
@@ -10,7 +15,7 @@ def headers_raw_to_dict(headers_raw):
 
     >>> import w3lib.http
     >>> w3lib.http.headers_raw_to_dict(b"Content-type: text/html\n\rAccept: gzip\n\n")   # doctest: +SKIP
-    {'Content-type': ['text/html'], 'Accept': ['gzip']}
+    {b'Content-type': [b'text/html'], b'Accept': [b'gzip']}
 
     Incorrect input:
 
@@ -30,7 +35,7 @@ def headers_raw_to_dict(headers_raw):
     headers = headers_raw.splitlines()
     headers_tuples = [header.split(b':', 1) for header in headers]
 
-    result_dict = {}
+    result_dict = {}  # type: Dict[bytes, List[bytes]]
     for header_item in headers_tuples:
         if not len(header_item) == 2:
             continue
@@ -47,6 +52,7 @@ def headers_raw_to_dict(headers_raw):
 
 
 def headers_dict_to_raw(headers_dict):
+    # type: (Optional[Dict[bytes, Union[bytes, List[bytes]]]]) -> Optional[bytes]
     r"""
     Returns a raw HTTP headers representation of headers
 
@@ -79,6 +85,7 @@ def headers_dict_to_raw(headers_dict):
 
 
 def basic_auth_header(username, password, encoding='ISO-8859-1'):
+    # type: (String, String, String) -> bytes
     """
     Return an `Authorization` header field value for
     `HTTP Basic Access Authentication (RFC 2617)`_
@@ -89,8 +96,7 @@ def basic_auth_header(username, password, encoding='ISO-8859-1'):
 
     .. _HTTP Basic Access Authentication (RFC 2617): http://www.ietf.org/rfc/rfc2617.txt
     """
-
-    auth = "%s:%s" % (username, password)
+    auth = "%s:%s" % (username, password)  # type: Any
     if not isinstance(auth, bytes):
         # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1
         # seems to be the most widely used encoding here. See also:
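`basic_auth_header` is annotated `(String, String, String) -> bytes`: flexible text in, a definite bytes value out. A small call-site sketch with made-up credentials:

    # Sketch; the credentials are assumptions, not from this PR.
    import w3lib.http

    header = w3lib.http.basic_auth_header(u'user', u'pass')
    print(header)  # b'Basic dXNlcjpwYXNz', bytes on both Python 2 and 3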
""" - - if six.PY2: - baseurl = to_bytes(baseurl, encoding) + baseurl_str = to_native_str(baseurl) try: - text = to_unicode(text, encoding) + text_unicode = to_unicode(text, encoding) except UnicodeDecodeError: print(text) raise - text = remove_tags_with_content(text, ignore_tags) - text = remove_comments(replace_entities(text)) - m = _meta_refresh_re.search(text) + text_unicode = remove_tags_with_content(text_unicode, ignore_tags) + text_unicode = remove_comments(replace_entities(text_unicode)) + m = _meta_refresh_re.search(text_unicode) if m: interval = float(m.group('int')) url = safe_url_string(m.group('url').strip(' "\''), encoding) - url = moves.urllib.parse.urljoin(baseurl, url) + url = moves.urllib.parse.urljoin(baseurl_str, url) return interval, url else: return None, None diff --git a/w3lib/http.py b/w3lib/http.py index c7b94a23..848fac7a 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,7 +1,12 @@ +from __future__ import absolute_import from base64 import urlsafe_b64encode +from typing import Dict, List, Optional, Union, Any + +from ._types import String def headers_raw_to_dict(headers_raw): + # type: (Optional[bytes]) -> Optional[Dict[bytes, List[bytes]]] r""" Convert raw headers (single multi-line bytestring) to a dictionary. @@ -10,7 +15,7 @@ def headers_raw_to_dict(headers_raw): >>> import w3lib.http >>> w3lib.http.headers_raw_to_dict(b"Content-type: text/html\n\rAccept: gzip\n\n") # doctest: +SKIP - {'Content-type': ['text/html'], 'Accept': ['gzip']} + {b'Content-type': [b'text/html'], b'Accept': [b'gzip']} Incorrect input: @@ -30,7 +35,7 @@ def headers_raw_to_dict(headers_raw): headers = headers_raw.splitlines() headers_tuples = [header.split(b':', 1) for header in headers] - result_dict = {} + result_dict = {} # type: Dict[bytes, List[bytes]] for header_item in headers_tuples: if not len(header_item) == 2: continue @@ -47,6 +52,7 @@ def headers_raw_to_dict(headers_raw): def headers_dict_to_raw(headers_dict): + # type: (Optional[Dict[bytes, Union[bytes, List[bytes]]]]) -> Optional[bytes] r""" Returns a raw HTTP headers representation of headers @@ -79,6 +85,7 @@ def headers_dict_to_raw(headers_dict): def basic_auth_header(username, password, encoding='ISO-8859-1'): + # type: (String, String, String) -> bytes """ Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_ @@ -89,8 +96,7 @@ def basic_auth_header(username, password, encoding='ISO-8859-1'): .. _HTTP Basic Access Authentication (RFC 2617): http://www.ietf.org/rfc/rfc2617.txt """ - - auth = "%s:%s" % (username, password) + auth = "%s:%s" % (username, password) # type: Any if not isinstance(auth, bytes): # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 # seems to be the most widely used encoding here. 
 
 def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
+    # type: (str, Union[str, Sequence[str]], str, str, bool, bool, bool) -> str
     """Clean URL arguments leaving only those passed in the parameterlist keeping order
 
     >>> import w3lib.url
@@ -179,7 +193,7 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
         parameterlist = [parameterlist]
     url, fragment = urldefrag(url)
     base, _, query = url.partition('?')
-    seen = set()
+    seen = set()  # type: Set[str]
     querylist = []
     for ksv in query.split(sep):
         if not ksv:
             continue
@@ -208,10 +222,11 @@ def _add_or_replace_parameters(url, params):
     new_args.update(params)
     query = urlencode(new_args)
-    return urlunsplit(parsed._replace(query=query))
-
+    # "SplitResult" has no attribute "_replace" - looks like a bug in typeshed
+    return urlunsplit(parsed._replace(query=query))  # type: ignore
 
 def add_or_replace_parameter(url, name, new_value):
+    # type: (str, str, str) -> str
     """Add or remove a parameter to a given url
 
     >>> import w3lib.url
@@ -243,6 +258,7 @@ def add_or_replace_parameters(url, new_parameters):
 
 
 def path_to_file_uri(path):
+    # type: (str) -> str
     """Convert local filesystem path to legal File URIs as
     described in:
     http://en.wikipedia.org/wiki/File_URI_scheme
     """
@@ -253,6 +269,7 @@
 
 
 def file_uri_to_path(uri):
+    # type: (str) -> str
     """Convert File URI to local filesystem path according to:
     http://en.wikipedia.org/wiki/File_URI_scheme
     """
@@ -261,6 +278,7 @@
 
 
 def any_to_uri(uri_or_path):
+    # type: (str) -> str
     """If given a path name, return its File URI, otherwise return
     it unmodified
     """
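Both `# type: ignore` comments in this file are line-scoped: they silence one known false positive each (the typeshed `SplitResult._replace` gap above, the private `_coerce_args` import below) while the rest of the module stays checked. A generic sketch of how narrow that suppression is:

    # Generic sketch (hypothetical snippet): the ignore applies only
    # to the line it sits on, so surrounding code is still checked.
    from six.moves.urllib.parse import urlsplit, urlunsplit

    parts = urlsplit('http://example.com/?a=1')
    print(urlunsplit(parts._replace(query='b=2')))  # type: ignore
    # -> http://example.com/?b=2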
@@ -525,7 +543,7 @@ def parse_url(url, encoding=None):
 
 if not six.PY2:
-    from urllib.parse import _coerce_args, unquote_to_bytes
+    from urllib.parse import _coerce_args, unquote_to_bytes  # type: ignore
 
     def parse_qsl_to_bytes(qs, keep_blank_values=False):
         """Parse a query given as a string argument.
 
diff --git a/w3lib/util.py b/w3lib/util.py
index d8513eef..3015525d 100644
--- a/w3lib/util.py
+++ b/w3lib/util.py
@@ -1,4 +1,8 @@
 import six
+from typing import AnyStr, Optional
+
+from ._types import String
+
 
 def str_to_unicode(text, encoding=None, errors='strict'):
     if encoding is None:
@@ -7,6 +11,7 @@ def str_to_unicode(text, encoding=None, errors='strict'):
         return text.decode(encoding, errors)
     return text
 
+
 def unicode_to_str(text, encoding=None, errors='strict'):
     if encoding is None:
         encoding = 'utf-8'
@@ -14,19 +19,23 @@ def unicode_to_str(text, encoding=None, errors='strict'):
         return text.encode(encoding, errors)
     return text
 
+
 def to_unicode(text, encoding=None, errors='strict'):
+    # type: (AnyStr, Optional[String], String) -> six.text_type
     """Return the unicode representation of a bytes object `text`. If
     `text` is already an unicode object, return it as-is."""
     if isinstance(text, six.text_type):
         return text
-    if not isinstance(text, (bytes, six.text_type)):
+    if not isinstance(text, bytes):
         raise TypeError('to_unicode must receive a bytes, str or unicode '
                         'object, got %s' % type(text).__name__)
     if encoding is None:
         encoding = 'utf-8'
     return text.decode(encoding, errors)
 
+
 def to_bytes(text, encoding=None, errors='strict'):
+    # type: (AnyStr, Optional[String], String) -> bytes
     """Return the binary representation of `text`. If `text`
     is already a bytes object, return it as-is."""
     if isinstance(text, bytes):
@@ -38,7 +47,9 @@ def to_bytes(text, encoding=None, errors='strict'):
         encoding = 'utf-8'
     return text.encode(encoding, errors)
 
+
 def to_native_str(text, encoding=None, errors='strict'):
+    # type: (AnyStr, Optional[String], String) -> str
     """ Return str representation of `text`
     (bytes in Python 2.x and unicode in Python 3.x). """
     if six.PY2:
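The util.py converters anchor the whole scheme: `AnyStr` on the way in, one fixed concrete type on the way out. A closing sketch of the three contracts; the sample values are assumptions, not fixtures from this PR:

    # Sketch; return types follow the annotations above.
    from w3lib.util import to_bytes, to_native_str, to_unicode

    print(to_unicode(b'caf\xc3\xa9'))  # u'caf\xe9', always unicode
    print(to_bytes(u'caf\xe9'))        # b'caf\xc3\xa9', always bytes
    print(to_native_str(u'cafe'))      # bytes on Python 2, str on Python 3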