Skip to content

Commit 1cfc0d1

Browse files
committed
httputil: Centralize regexes based directly on RFCs
This will make it easier to stay in strict conformance with the RFCs. Note that this commit makes a few small semantic changes to response start-line parsing: status codes must be exactly three digits, and control characters are not allowed in reason phrases.
1 parent f3a9bd4 commit 1cfc0d1

File tree

1 file changed

+39
-12
lines changed

1 file changed

+39
-12
lines changed

tornado/httputil.py

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,32 @@
7171
# To be used with str.strip() and related methods.
7272
HTTP_WHITESPACE = " \t"
7373

74-
HTTP_TOKEN_RE = re.compile(r"^[!#$%&'*+\-.^_`|~0-9A-Za-z]+$")
74+
75+
class _ABNF:
76+
"""Class that holds a subset of ABNF rules from RFC 9110 and friends.
77+
78+
Class attributes are re.Pattern objects, with the same name as in the RFC
79+
(with hyphens changed to underscores). Currently contains only the subset
80+
we use (which is why this class is not public). Unfortunately the fields
81+
cannot be alphabetized as they are in the RFCs because of dependencies.
82+
"""
83+
84+
# RFC 5234 (ABNF)
85+
VCHAR = re.compile(r"[\x21-\x7E]")
86+
87+
# RFC 9110 (HTTP Semantics)
88+
obs_text = re.compile(r"[\x80-\xFF]")
89+
tchar = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]")
90+
token = re.compile(rf"{tchar.pattern}+")
91+
field_name = token
92+
93+
# RFC 9112 (HTTP/1.1)
94+
HTTP_version = re.compile(r"HTTP/[0-9]\.[0-9]")
95+
reason_phrase = re.compile(rf"(?:[\t ]|{VCHAR.pattern}|{obs_text.pattern})+")
96+
status_code = re.compile(r"[0-9]{3}")
97+
status_line = re.compile(
98+
rf"({HTTP_version.pattern}) ({status_code.pattern}) ({reason_phrase.pattern})?"
99+
)
75100

76101

77102
@lru_cache(1000)
@@ -145,7 +170,7 @@ def __init__(self, *args: typing.Any, **kwargs: str) -> None: # noqa: F811
145170

146171
def add(self, name: str, value: str) -> None:
147172
"""Adds a new value for the given key."""
148-
if not HTTP_TOKEN_RE.match(name):
173+
if not _ABNF.token.fullmatch(name):
149174
raise HTTPInputError("Invalid header name %r" % name)
150175
norm_name = _normalize_header(name)
151176
self._last_key = norm_name
@@ -892,9 +917,6 @@ class RequestStartLine(typing.NamedTuple):
892917
version: str
893918

894919

895-
_http_version_re = re.compile(r"^HTTP/1\.[0-9]$")
896-
897-
898920
def parse_request_start_line(line: str) -> RequestStartLine:
899921
"""Returns a (method, path, version) tuple for an HTTP 1.x request line.
900922
@@ -909,10 +931,15 @@ def parse_request_start_line(line: str) -> RequestStartLine:
909931
# https://tools.ietf.org/html/rfc7230#section-3.1.1
910932
# invalid request-line SHOULD respond with a 400 (Bad Request)
911933
raise HTTPInputError("Malformed HTTP request line")
912-
if not _http_version_re.match(version):
934+
if not _ABNF.HTTP_version.fullmatch(version):
913935
raise HTTPInputError(
914936
"Malformed HTTP version in HTTP Request-Line: %r" % version
915937
)
938+
if not version.startswith("HTTP/1"):
939+
# HTTP/2 and above don't use parse_request_start_line.
940+
# This could be folded into the regex but we don't want to deviate
941+
# from the ABNF in the RFCs.
942+
raise HTTPInputError("Unexpected HTTP version %r" % version)
916943
return RequestStartLine(method, path, version)
917944

918945

@@ -922,9 +949,6 @@ class ResponseStartLine(typing.NamedTuple):
922949
reason: str
923950

924951

925-
_http_response_line_re = re.compile(r"(HTTP/1.[0-9]) ([0-9]+) ([^\r]*)")
926-
927-
928952
def parse_response_start_line(line: str) -> ResponseStartLine:
929953
"""Returns a (version, code, reason) tuple for an HTTP 1.x response line.
930954
@@ -933,11 +957,14 @@ def parse_response_start_line(line: str) -> ResponseStartLine:
933957
>>> parse_response_start_line("HTTP/1.1 200 OK")
934958
ResponseStartLine(version='HTTP/1.1', code=200, reason='OK')
935959
"""
936-
line = native_str(line)
937-
match = _http_response_line_re.match(line)
960+
match = _ABNF.status_line.fullmatch(line)
938961
if not match:
939962
raise HTTPInputError("Error parsing response start line")
940-
return ResponseStartLine(match.group(1), int(match.group(2)), match.group(3))
963+
r = ResponseStartLine(match.group(1), int(match.group(2)), match.group(3))
964+
if not r.version.startswith("HTTP/1"):
965+
# HTTP/2 and above don't use parse_response_start_line.
966+
raise HTTPInputError("Unexpected HTTP version %r" % r.version)
967+
return r
941968

942969

943970
# _parseparam and _parse_header are copied and modified from python2.7's cgi.py

0 commit comments

Comments
 (0)