Merge pull request #3489 from bdarnell/field-value

bdarnell · web-flow · commit bb614003a390 · 2025-04-28T12:57:35.000-04:00
httputil: Forbid control chars and CR in header values
diff --git a/tornado/httputil.py b/tornado/httputil.py
@@ -33,7 +33,7 @@
 import unicodedata
 from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
 
-from tornado.escape import native_str, parse_qs_bytes, utf8
+from tornado.escape import native_str, parse_qs_bytes, utf8, to_unicode
 from tornado.log import gen_log
 from tornado.util import ObjectDict, unicode_type
 
@@ -100,6 +100,12 @@ class _ABNF:
     # RFC 9110 (HTTP Semantics)
     obs_text = re.compile(r"[\x80-\xFF]")
     field_vchar = re.compile(rf"(?:{VCHAR.pattern}|{obs_text.pattern})")
+    # Not exactly from the RFC to simplify and combine field-content and field-value.
+    field_value = re.compile(
+        rf"|"
+        rf"{field_vchar.pattern}|"
+        rf"{field_vchar.pattern}(?:{field_vchar.pattern}| |\t)*{field_vchar.pattern}"
+    )
     tchar = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]")
     token = re.compile(rf"{tchar.pattern}+")
     field_name = token
@@ -195,6 +201,10 @@ def add(self, name: str, value: str) -> None:
         """Adds a new value for the given key."""
         if not _ABNF.field_name.fullmatch(name):
             raise HTTPInputError("Invalid header name %r" % name)
+        if not _ABNF.field_value.fullmatch(to_unicode(value)):
+            # TODO: the fact we still support bytes here (contrary to type annotations)
+            # and still test for it should probably be changed.
+            raise HTTPInputError("Invalid header value %r" % value)
         norm_name = _normalize_header(name)
         self._last_key = norm_name
         if norm_name in self:
@@ -254,6 +264,8 @@ def parse_line(self, line: str) -> None:
             if self._last_key is None:
                 raise HTTPInputError("first header line cannot start with whitespace")
             new_part = " " + line.strip(HTTP_WHITESPACE)
+            if not _ABNF.field_value.fullmatch(new_part[1:]):
+                raise HTTPInputError("Invalid header continuation %r" % new_part)
             self._as_list[self._last_key][-1] += new_part
             self._dict[self._last_key] += new_part
         else:
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py
@@ -303,6 +303,17 @@ def test_continuation(self):
         data = "Foo: bar\r\n\fasdf"
         self.assertRaises(HTTPInputError, HTTPHeaders.parse, data)
 
+    def test_forbidden_ascii_characters(self):
+        # Control characters and ASCII whitespace other than space, tab, and CRLF are not allowed in
+        # headers.
+        for c in range(0xFF):
+            data = f"Foo: bar{chr(c)}baz\r\n"
+            if c == 0x09 or (c >= 0x20 and c != 0x7F):
+                headers = HTTPHeaders.parse(data)
+                self.assertEqual(headers["Foo"], f"bar{chr(c)}baz")
+            else:
+                self.assertRaises(HTTPInputError, HTTPHeaders.parse, data)
+
     def test_unicode_newlines(self):
         # Ensure that only \r\n is recognized as a header separator, and not
         # the other newline-like unicode characters.
@@ -311,10 +322,13 @@ def test_unicode_newlines(self):
         # and cpython's unicodeobject.c (which defines the implementation
         # of unicode_type.splitlines(), and uses a different list than TR13).
         newlines = [
-            "\u001b",  # VERTICAL TAB
-            "\u001c",  # FILE SEPARATOR
-            "\u001d",  # GROUP SEPARATOR
-            "\u001e",  # RECORD SEPARATOR
+            # The following ascii characters are sometimes treated as newline-like,
+            # but they're disallowed in HTTP headers. This test covers unicode
+            # characters that are permitted in headers (under the obs-text rule).
+            # "\u001b",  # VERTICAL TAB
+            # "\u001c",  # FILE SEPARATOR
+            # "\u001d",  # GROUP SEPARATOR
+            # "\u001e",  # RECORD SEPARATOR
             "\u0085",  # NEXT LINE
             "\u2028",  # LINE SEPARATOR
             "\u2029",  # PARAGRAPH SEPARATOR
@@ -363,13 +377,16 @@ def test_unicode_whitespace(self):
             self.assertEqual(expected, list(headers.get_all()))
 
     def test_optional_cr(self):
+        # Bare CR is  not a valid line separator
+        with self.assertRaises(HTTPInputError):
+            HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n")
+
         # Both CRLF and LF should be accepted as separators. CR should not be
-        # part of the data when followed by LF, but it is a normal char
-        # otherwise (or should bare CR be an error?)
-        headers = HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n")
+        # part of the data when followed by LF.
+        headers = HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nMore: more\r\n")
         self.assertEqual(
             sorted(headers.get_all()),
-            [("Cr", "cr\rMore: more"), ("Crlf", "crlf"), ("Lf", "lf")],
+            [("Crlf", "crlf"), ("Lf", "lf"), ("More", "more")],
         )
 
     def test_copy(self):