Skip to content

Commit d748292

Browse files
authored
Merge branch 'main' into fix-6876
2 parents e7952dd + c2bb3f9 commit d748292

File tree

15 files changed

+681
-350
lines changed

15 files changed

+681
-350
lines changed

Doc/library/stdtypes.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1891,12 +1891,15 @@ expression support in the :mod:`re` module).
18911891
(``\n``) or return (``\r``), it is copied and the current column is reset to
18921892
zero. Any other character is copied unchanged and the current column is
18931893
incremented by one regardless of how the character is represented when
1894-
printed.
1894+
printed. For example::
18951895

18961896
>>> '01\t012\t0123\t01234'.expandtabs()
18971897
'01 012 0123 01234'
18981898
>>> '01\t012\t0123\t01234'.expandtabs(4)
18991899
'01 012 0123 01234'
1900+
>>> print('01\t012\n0123\t01234'.expandtabs(4))
1901+
01 012
1902+
0123 01234
19001903

19011904

19021905
.. method:: str.find(sub[, start[, end]])

Include/internal/pycore_crossinterp.h

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -303,10 +303,10 @@ typedef struct _excinfo {
303303
const char *errdisplay;
304304
} _PyXI_excinfo;
305305

306-
PyAPI_FUNC(int) _PyXI_InitExcInfo(_PyXI_excinfo *info, PyObject *exc);
306+
PyAPI_FUNC(_PyXI_excinfo *) _PyXI_NewExcInfo(PyObject *exc);
307+
PyAPI_FUNC(void) _PyXI_FreeExcInfo(_PyXI_excinfo *info);
307308
PyAPI_FUNC(PyObject *) _PyXI_FormatExcInfo(_PyXI_excinfo *info);
308309
PyAPI_FUNC(PyObject *) _PyXI_ExcInfoAsObject(_PyXI_excinfo *info);
309-
PyAPI_FUNC(void) _PyXI_ClearExcInfo(_PyXI_excinfo *info);
310310

311311

312312
typedef enum error_code {
@@ -322,19 +322,20 @@ typedef enum error_code {
322322
_PyXI_ERR_NOT_SHAREABLE = -9,
323323
} _PyXI_errcode;
324324

325+
typedef struct xi_failure _PyXI_failure;
325326

326-
typedef struct _sharedexception {
327-
// The originating interpreter.
328-
PyInterpreterState *interp;
329-
// The kind of error to propagate.
330-
_PyXI_errcode code;
331-
// The exception information to propagate, if applicable.
332-
// This is populated only for some error codes,
333-
// but always for _PyXI_ERR_UNCAUGHT_EXCEPTION.
334-
_PyXI_excinfo uncaught;
335-
} _PyXI_error;
327+
PyAPI_FUNC(_PyXI_failure *) _PyXI_NewFailure(void);
328+
PyAPI_FUNC(void) _PyXI_FreeFailure(_PyXI_failure *);
329+
PyAPI_FUNC(_PyXI_errcode) _PyXI_GetFailureCode(_PyXI_failure *);
330+
PyAPI_FUNC(int) _PyXI_InitFailure(_PyXI_failure *, _PyXI_errcode, PyObject *);
331+
PyAPI_FUNC(void) _PyXI_InitFailureUTF8(
332+
_PyXI_failure *,
333+
_PyXI_errcode,
334+
const char *);
336335

337-
PyAPI_FUNC(PyObject *) _PyXI_ApplyError(_PyXI_error *err);
336+
PyAPI_FUNC(int) _PyXI_UnwrapNotShareableError(
337+
PyThreadState *,
338+
_PyXI_failure *);
338339

339340

340341
// A cross-interpreter session involves entering an interpreter
@@ -366,19 +367,21 @@ PyAPI_FUNC(int) _PyXI_Enter(
366367
_PyXI_session_result *);
367368
PyAPI_FUNC(int) _PyXI_Exit(
368369
_PyXI_session *,
369-
_PyXI_errcode,
370+
_PyXI_failure *,
370371
_PyXI_session_result *);
371372

372373
PyAPI_FUNC(PyObject *) _PyXI_GetMainNamespace(
373374
_PyXI_session *,
374-
_PyXI_errcode *);
375+
_PyXI_failure *);
375376

376377
PyAPI_FUNC(int) _PyXI_Preserve(
377378
_PyXI_session *,
378379
const char *,
379380
PyObject *,
380-
_PyXI_errcode *);
381-
PyAPI_FUNC(PyObject *) _PyXI_GetPreserved(_PyXI_session_result *, const char *);
381+
_PyXI_failure *);
382+
PyAPI_FUNC(PyObject *) _PyXI_GetPreserved(
383+
_PyXI_session_result *,
384+
const char *);
382385

383386

384387
/*************/

Lib/html/parser.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
2828

2929
starttagopen = re.compile('<[a-zA-Z]')
30+
endtagopen = re.compile('</[a-zA-Z]')
3031
piclose = re.compile('>')
3132
commentclose = re.compile(r'--\s*>')
3233
# Note:
@@ -195,25 +196,43 @@ def goahead(self, end):
195196
k = self.parse_pi(i)
196197
elif startswith("<!", i):
197198
k = self.parse_html_declaration(i)
198-
elif (i + 1) < n:
199+
elif (i + 1) < n or end:
199200
self.handle_data("<")
200201
k = i + 1
201202
else:
202203
break
203204
if k < 0:
204205
if not end:
205206
break
206-
k = rawdata.find('>', i + 1)
207-
if k < 0:
208-
k = rawdata.find('<', i + 1)
209-
if k < 0:
210-
k = i + 1
211-
else:
212-
k += 1
213-
if self.convert_charrefs and not self.cdata_elem:
214-
self.handle_data(unescape(rawdata[i:k]))
207+
if starttagopen.match(rawdata, i): # < + letter
208+
pass
209+
elif startswith("</", i):
210+
if i + 2 == n:
211+
self.handle_data("</")
212+
elif endtagopen.match(rawdata, i): # </ + letter
213+
pass
214+
else:
215+
# bogus comment
216+
self.handle_comment(rawdata[i+2:])
217+
elif startswith("<!--", i):
218+
j = n
219+
for suffix in ("--!", "--", "-"):
220+
if rawdata.endswith(suffix, i+4):
221+
j -= len(suffix)
222+
break
223+
self.handle_comment(rawdata[i+4:j])
224+
elif startswith("<![CDATA[", i):
225+
self.unknown_decl(rawdata[i+3:])
226+
elif rawdata[i:i+9].lower() == '<!doctype':
227+
self.handle_decl(rawdata[i+2:])
228+
elif startswith("<!", i):
229+
# bogus comment
230+
self.handle_comment(rawdata[i+2:])
231+
elif startswith("<?", i):
232+
self.handle_pi(rawdata[i+2:])
215233
else:
216-
self.handle_data(rawdata[i:k])
234+
raise AssertionError("we should not get here!")
235+
k = n
217236
i = self.updatepos(i, k)
218237
elif startswith("&#", i):
219238
match = charref.match(rawdata, i)

Lib/test/_code_definitions.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,13 @@ def spam_with_globals_and_builtins():
5757
print(res)
5858

5959

60+
def spam_with_global_and_attr_same_name():
61+
try:
62+
spam_minimal.spam_minimal
63+
except AttributeError:
64+
pass
65+
66+
6067
def spam_full_args(a, b, /, c, d, *args, e, f, **kwargs):
6168
return (a, b, c, d, e, f, args, kwargs)
6269

@@ -190,6 +197,7 @@ def ham_C_closure(z):
190197
spam_minimal,
191198
spam_with_builtins,
192199
spam_with_globals_and_builtins,
200+
spam_with_global_and_attr_same_name,
193201
spam_full_args,
194202
spam_full_args_with_defaults,
195203
spam_args_attrs_and_builtins,
@@ -258,6 +266,7 @@ def ham_C_closure(z):
258266
script_with_globals,
259267
spam_full_args_with_defaults,
260268
spam_with_globals_and_builtins,
269+
spam_with_global_and_attr_same_name,
261270
spam_full,
262271
]
263272

@@ -275,6 +284,7 @@ def ham_C_closure(z):
275284
*PURE_SCRIPT_FUNCTIONS,
276285
script_with_globals,
277286
spam_with_globals_and_builtins,
287+
spam_with_global_and_attr_same_name,
278288
]
279289

280290

Lib/test/test_code.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,7 @@ def test_local_kinds(self):
701701
'checks': CO_FAST_LOCAL,
702702
'res': CO_FAST_LOCAL,
703703
},
704+
defs.spam_with_global_and_attr_same_name: {},
704705
defs.spam_full_args: {
705706
'a': POSONLY,
706707
'b': POSONLY,
@@ -955,6 +956,10 @@ def new_var_counts(*,
955956
purelocals=5,
956957
globalvars=6,
957958
),
959+
defs.spam_with_global_and_attr_same_name: new_var_counts(
960+
globalvars=2,
961+
attrs=1,
962+
),
958963
defs.spam_full_args: new_var_counts(
959964
posonly=2,
960965
posorkw=2,

Lib/test/test_fstring.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1380,7 +1380,7 @@ def test_conversions(self):
13801380
for conv in ' s', ' s ':
13811381
self.assertAllRaise(SyntaxError,
13821382
"f-string: conversion type must come right after the"
1383-
" exclamanation mark",
1383+
" exclamation mark",
13841384
["f'{3!" + conv + "}'"])
13851385

13861386
self.assertAllRaise(SyntaxError,

Lib/test/test_htmlparser.py

Lines changed: 77 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import unittest
66

77
from unittest.mock import patch
8+
from test import support
89

910

1011
class EventCollector(html.parser.HTMLParser):
@@ -430,28 +431,34 @@ def test_tolerant_parsing(self):
430431
('data', '<'),
431432
('starttag', 'bc<', [('a', None)]),
432433
('endtag', 'html'),
433-
('data', '\n<img src="URL>'),
434-
('comment', '/img'),
435-
('endtag', 'html<')])
434+
('data', '\n')])
436435

437436
def test_starttag_junk_chars(self):
437+
self._run_check("<", [('data', '<')])
438+
self._run_check("<>", [('data', '<>')])
439+
self._run_check("< >", [('data', '< >')])
440+
self._run_check("< ", [('data', '< ')])
438441
self._run_check("</>", [])
442+
self._run_check("<$>", [('data', '<$>')])
439443
self._run_check("</$>", [('comment', '$')])
440444
self._run_check("</", [('data', '</')])
441-
self._run_check("</a", [('data', '</a')])
445+
self._run_check("</a", [])
446+
self._run_check("</ a>", [('endtag', 'a')])
447+
self._run_check("</ a", [('comment', ' a')])
442448
self._run_check("<a<a>", [('starttag', 'a<a', [])])
443449
self._run_check("</a<a>", [('endtag', 'a<a')])
444-
self._run_check("<!", [('data', '<!')])
445-
self._run_check("<a", [('data', '<a')])
446-
self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
447-
self._run_check("<a foo='bar", [('data', "<a foo='bar")])
448-
self._run_check("<a foo='>'", [('data', "<a foo='>'")])
449-
self._run_check("<a foo='>", [('data', "<a foo='>")])
450+
self._run_check("<!", [('comment', '')])
451+
self._run_check("<a", [])
452+
self._run_check("<a foo='bar'", [])
453+
self._run_check("<a foo='bar", [])
454+
self._run_check("<a foo='>'", [])
455+
self._run_check("<a foo='>", [])
450456
self._run_check("<a$>", [('starttag', 'a$', [])])
451457
self._run_check("<a$b>", [('starttag', 'a$b', [])])
452458
self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
453459
self._run_check("<a$b >", [('starttag', 'a$b', [])])
454460
self._run_check("<a$b />", [('startendtag', 'a$b', [])])
461+
self._run_check("</a$b>", [('endtag', 'a$b')])
455462

456463
def test_slashes_in_starttag(self):
457464
self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
@@ -576,21 +583,50 @@ def test_EOF_in_charref(self):
576583
for html, expected in data:
577584
self._run_check(html, expected)
578585

579-
def test_EOF_in_comments_or_decls(self):
586+
def test_eof_in_comments(self):
580587
data = [
581-
('<!', [('data', '<!')]),
582-
('<!-', [('data', '<!-')]),
583-
('<!--', [('data', '<!--')]),
584-
('<![', [('data', '<![')]),
585-
('<![CDATA[', [('data', '<![CDATA[')]),
586-
('<![CDATA[x', [('data', '<![CDATA[x')]),
587-
('<!DOCTYPE', [('data', '<!DOCTYPE')]),
588-
('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
588+
('<!--', [('comment', '')]),
589+
('<!---', [('comment', '')]),
590+
('<!----', [('comment', '')]),
591+
('<!-----', [('comment', '-')]),
592+
('<!------', [('comment', '--')]),
593+
('<!----!', [('comment', '')]),
594+
('<!---!', [('comment', '-!')]),
595+
('<!---!>', [('comment', '-!>')]),
596+
('<!--foo', [('comment', 'foo')]),
597+
('<!--foo-', [('comment', 'foo')]),
598+
('<!--foo--', [('comment', 'foo')]),
599+
('<!--foo--!', [('comment', 'foo')]),
600+
('<!--<!--', [('comment', '<!')]),
601+
('<!--<!--!', [('comment', '<!')]),
589602
]
590603
for html, expected in data:
591604
self._run_check(html, expected)
605+
606+
def test_eof_in_declarations(self):
607+
data = [
608+
('<!', [('comment', '')]),
609+
('<!-', [('comment', '-')]),
610+
('<![', [('comment', '[')]),
611+
('<![CDATA[', [('unknown decl', 'CDATA[')]),
612+
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
613+
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
614+
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
615+
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
616+
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
617+
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
618+
('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
619+
('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
620+
('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
621+
('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
622+
[('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
623+
]
624+
for html, expected in data:
625+
self._run_check(html, expected)
626+
592627
def test_bogus_comments(self):
593-
html = ('<! not really a comment >'
628+
html = ('<!ELEMENT br EMPTY>'
629+
'<! not really a comment >'
594630
'<! not a comment either -->'
595631
'<! -- close enough -->'
596632
'<!><!<-- this was an empty comment>'
@@ -604,6 +640,7 @@ def test_bogus_comments(self):
604640
'<![CDATA]]>' # required '[' after CDATA
605641
)
606642
expected = [
643+
('comment', 'ELEMENT br EMPTY'),
607644
('comment', ' not really a comment '),
608645
('comment', ' not a comment either --'),
609646
('comment', ' -- close enough --'),
@@ -684,6 +721,26 @@ def test_convert_charrefs_dropped_text(self):
684721
('endtag', 'a'), ('data', ' bar & baz')]
685722
)
686723

724+
@support.requires_resource('cpu')
725+
def test_eof_no_quadratic_complexity(self):
726+
# Each of these examples used to take about an hour.
727+
# Now they take a fraction of a second.
728+
def check(source):
729+
parser = html.parser.HTMLParser()
730+
parser.feed(source)
731+
parser.close()
732+
n = 120_000
733+
check("<a " * n)
734+
check("<a a=" * n)
735+
check("</a " * 14 * n)
736+
check("</a a=" * 11 * n)
737+
check("<!--" * 4 * n)
738+
check("<!" * 60 * n)
739+
check("<?" * 19 * n)
740+
check("</$" * 15 * n)
741+
check("<![CDATA[" * 9 * n)
742+
check("<!doctype" * 35 * n)
743+
687744

688745
class AttributesTestCase(TestCaseBase):
689746

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix typo in the f-string conversion type error message ("exclamanation" -> "exclamation").
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix quadratic complexity in processing specially crafted input in
2+
:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
3+
to the HTML5 specs -- comments and declarations are automatically closed,
4+
and incomplete tags are ignored.

0 commit comments

Comments
 (0)