Skip to content

Commit 0808248

Browse files
authored
Merge pull request #3554 from bdarnell/parseparam
httputil: Fix quadratic behavior in _parseparam
2 parents f33ce9f + 03198b5 commit 0808248

File tree

2 files changed

+45
-7
lines changed

2 files changed

+45
-7
lines changed

tornado/httputil.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1096,19 +1096,34 @@ def parse_response_start_line(line: str) -> ResponseStartLine:
10961096
# It has also been modified to support valueless parameters as seen in
10971097
# websocket extension negotiations, and to support non-ascii values in
10981098
# RFC 2231/5987 format.
1099+
#
1100+
# _parseparam has been further modified with the logic from
1101+
# https://github.com/python/cpython/pull/136072/files
1102+
# to avoid quadratic behavior when parsing semicolons in quoted strings.
1103+
#
1104+
# TODO: See if we can switch to email.message.Message for this functionality.
1105+
# This is the suggested replacement for the cgi.py module now that cgi has
1106+
# been removed from recent versions of Python. We need to verify that
1107+
# the email module is consistent with our existing behavior (and all relevant
1108+
# RFCs for multipart/form-data) before making this change.
10991109

11001110

11011111
def _parseparam(s: str) -> Generator[str, None, None]:
1102-
while s[:1] == ";":
1103-
s = s[1:]
1104-
end = s.find(";")
1105-
while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2:
1106-
end = s.find(";", end + 1)
1112+
start = 0
1113+
while s.find(";", start) == start:
1114+
start += 1
1115+
end = s.find(";", start)
1116+
ind, diff = start, 0
1117+
while end > 0:
1118+
diff += s.count('"', ind, end) - s.count('\\"', ind, end)
1119+
if diff % 2 == 0:
1120+
break
1121+
end, ind = ind, s.find(";", end + 1)
11071122
if end < 0:
11081123
end = len(s)
1109-
f = s[:end]
1124+
f = s[start:end]
11101125
yield f.strip()
1111-
s = s[end:]
1126+
start = end
11121127

11131128

11141129
def _parse_header(line: str) -> Tuple[str, Dict[str, str]]:

tornado/test/httputil_test.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,29 @@ def test_data_after_final_boundary(self):
279279
self.assertEqual(file["filename"], "ab.txt")
280280
self.assertEqual(file["body"], b"Foo")
281281

282+
def test_disposition_param_linear_performance(self):
    # This is a regression test for performance of parsing parameters
    # to the content-disposition header, specifically for semicolons within
    # quoted strings.
    def f(n):
        # Build a multipart body whose Content-Disposition header has a
        # quoted parameter containing ``n`` semicolons, parse it, and
        # return the elapsed time in seconds.
        # perf_counter is used because it is a high-resolution clock
        # intended for measuring short durations.
        start = time.perf_counter()
        message = (
            b"--1234\r\nContent-Disposition: form-data; "
            + b'x="'
            + b";" * n
            + b'"; '
            + b'name="files"; filename="a.txt"\r\n\r\nFoo\r\n--1234--\r\n'
        )
        args: dict[str, list[bytes]] = {}
        files: dict[str, list[HTTPFile]] = {}
        parse_multipart_form_data(b"1234", message, args, files)
        return time.perf_counter() - start

    d1 = f(1_000)
    d2 = f(10_000)
    # A 10x larger input should take roughly 10x as long if parsing is
    # linear; 20x leaves headroom for noise. Written as a multiplication
    # so that a zero-length first measurement cannot raise
    # ZeroDivisionError.
    if d2 > 20 * d1:
        self.fail(f"Disposition param parsing is not linear: {d1=} vs {d2=}")
304+
282305

283306
class HTTPHeadersTest(unittest.TestCase):
284307
def test_multi_line(self):

0 commit comments

Comments
 (0)