Skip to content

Commit 95296a9

Browse files
gh-140875: Fix handling of unclosed charrefs before EOF in HTMLParser (GH-140904)
1 parent afa0bad commit 95296a9

File tree

3 files changed: 109 additions and 33 deletions

Lib/html/parser.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
2626
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
27+
incomplete_charref = re.compile('&#(?:[0-9]|[xX][0-9a-fA-F])')
2728
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
2829

2930
starttagopen = re.compile('<[a-zA-Z]')
@@ -304,10 +305,20 @@ def goahead(self, end):
304305
k = k - 1
305306
i = self.updatepos(i, k)
306307
continue
308+
match = incomplete_charref.match(rawdata, i)
309+
if match:
310+
if end:
311+
self.handle_charref(rawdata[i+2:])
312+
i = self.updatepos(i, n)
313+
break
314+
# incomplete
315+
break
316+
elif i + 3 < n: # larger than "&#x"
317+
# not the end of the buffer, and can't be confused
318+
# with some other construct
319+
self.handle_data("&#")
320+
i = self.updatepos(i, i + 2)
307321
else:
308-
if ";" in rawdata[i:]: # bail by consuming &#
309-
self.handle_data(rawdata[i:i+2])
310-
i = self.updatepos(i, i+2)
311322
break
312323
elif startswith('&', i):
313324
match = entityref.match(rawdata, i)
@@ -321,15 +332,13 @@ def goahead(self, end):
321332
continue
322333
match = incomplete.match(rawdata, i)
323334
if match:
324-
# match.group() will contain at least 2 chars
325-
if end and match.group() == rawdata[i:]:
326-
k = match.end()
327-
if k <= i:
328-
k = n
329-
i = self.updatepos(i, i + 1)
335+
if end:
336+
self.handle_entityref(rawdata[i+1:])
337+
i = self.updatepos(i, n)
338+
break
330339
# incomplete
331340
break
332-
elif (i + 1) < n:
341+
elif i + 1 < n:
333342
# not the end of the buffer, and can't be confused
334343
# with some other construct
335344
self.handle_data("&")

Lib/test/test_htmlparser.py

Lines changed: 87 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,13 @@ def get_events(self):
109109

110110
class TestCaseBase(unittest.TestCase):
111111

112-
def get_collector(self):
113-
return EventCollector(convert_charrefs=False)
112+
def get_collector(self, convert_charrefs=False):
113+
return EventCollector(convert_charrefs=convert_charrefs)
114114

115-
def _run_check(self, source, expected_events, collector=None):
115+
def _run_check(self, source, expected_events,
116+
*, collector=None, convert_charrefs=False):
116117
if collector is None:
117-
collector = self.get_collector()
118+
collector = self.get_collector(convert_charrefs=convert_charrefs)
118119
parser = collector
119120
for s in source:
120121
parser.feed(s)
@@ -128,7 +129,7 @@ def _run_check(self, source, expected_events, collector=None):
128129

129130
def _run_check_extra(self, source, events):
130131
self._run_check(source, events,
131-
EventCollectorExtra(convert_charrefs=False))
132+
collector=EventCollectorExtra(convert_charrefs=False))
132133

133134

134135
class HTMLParserTestCase(TestCaseBase):
@@ -187,10 +188,87 @@ def test_malformatted_charref(self):
187188
])
188189

189190
def test_unclosed_entityref(self):
190-
self._run_check("&entityref foo", [
191-
("entityref", "entityref"),
192-
("data", " foo"),
193-
])
191+
self._run_check('&gt &lt;', [('entityref', 'gt'), ('data', ' '), ('entityref', 'lt')],
192+
convert_charrefs=False)
193+
self._run_check('&gt &lt;', [('data', '> <')], convert_charrefs=True)
194+
195+
self._run_check('&undefined &lt;',
196+
[('entityref', 'undefined'), ('data', ' '), ('entityref', 'lt')],
197+
convert_charrefs=False)
198+
self._run_check('&undefined &lt;', [('data', '&undefined <')],
199+
convert_charrefs=True)
200+
201+
self._run_check('&gtundefined &lt;',
202+
[('entityref', 'gtundefined'), ('data', ' '), ('entityref', 'lt')],
203+
convert_charrefs=False)
204+
self._run_check('&gtundefined &lt;', [('data', '>undefined <')],
205+
convert_charrefs=True)
206+
207+
self._run_check('& &lt;', [('data', '& '), ('entityref', 'lt')],
208+
convert_charrefs=False)
209+
self._run_check('& &lt;', [('data', '& <')], convert_charrefs=True)
210+
211+
def test_eof_in_entityref(self):
212+
self._run_check('&gt', [('entityref', 'gt')], convert_charrefs=False)
213+
self._run_check('&gt', [('data', '>')], convert_charrefs=True)
214+
215+
self._run_check('&g', [('entityref', 'g')], convert_charrefs=False)
216+
self._run_check('&g', [('data', '&g')], convert_charrefs=True)
217+
218+
self._run_check('&undefined', [('entityref', 'undefined')],
219+
convert_charrefs=False)
220+
self._run_check('&undefined', [('data', '&undefined')],
221+
convert_charrefs=True)
222+
223+
self._run_check('&gtundefined', [('entityref', 'gtundefined')],
224+
convert_charrefs=False)
225+
self._run_check('&gtundefined', [('data', '>undefined')],
226+
convert_charrefs=True)
227+
228+
self._run_check('&', [('data', '&')], convert_charrefs=False)
229+
self._run_check('&', [('data', '&')], convert_charrefs=True)
230+
231+
def test_unclosed_charref(self):
232+
self._run_check('&#123 &lt;', [('charref', '123'), ('data', ' '), ('entityref', 'lt')],
233+
convert_charrefs=False)
234+
self._run_check('&#123 &lt;', [('data', '{ <')], convert_charrefs=True)
235+
self._run_check('&#xab &lt;', [('charref', 'xab'), ('data', ' '), ('entityref', 'lt')],
236+
convert_charrefs=False)
237+
self._run_check('&#xab &lt;', [('data', '\xab <')], convert_charrefs=True)
238+
239+
self._run_check('&#123456789 &lt;',
240+
[('charref', '123456789'), ('data', ' '), ('entityref', 'lt')],
241+
convert_charrefs=False)
242+
self._run_check('&#123456789 &lt;', [('data', '\ufffd <')],
243+
convert_charrefs=True)
244+
self._run_check('&#x123456789 &lt;',
245+
[('charref', 'x123456789'), ('data', ' '), ('entityref', 'lt')],
246+
convert_charrefs=False)
247+
self._run_check('&#x123456789 &lt;', [('data', '\ufffd <')],
248+
convert_charrefs=True)
249+
250+
self._run_check('&# &lt;', [('data', '&# '), ('entityref', 'lt')], convert_charrefs=False)
251+
self._run_check('&# &lt;', [('data', '&# <')], convert_charrefs=True)
252+
self._run_check('&#x &lt;', [('data', '&#x '), ('entityref', 'lt')], convert_charrefs=False)
253+
self._run_check('&#x &lt;', [('data', '&#x <')], convert_charrefs=True)
254+
255+
def test_eof_in_charref(self):
256+
self._run_check('&#123', [('charref', '123')], convert_charrefs=False)
257+
self._run_check('&#123', [('data', '{')], convert_charrefs=True)
258+
self._run_check('&#xab', [('charref', 'xab')], convert_charrefs=False)
259+
self._run_check('&#xab', [('data', '\xab')], convert_charrefs=True)
260+
261+
self._run_check('&#123456789', [('charref', '123456789')],
262+
convert_charrefs=False)
263+
self._run_check('&#123456789', [('data', '\ufffd')], convert_charrefs=True)
264+
self._run_check('&#x123456789', [('charref', 'x123456789')],
265+
convert_charrefs=False)
266+
self._run_check('&#x123456789', [('data', '\ufffd')], convert_charrefs=True)
267+
268+
self._run_check('&#', [('data', '&#')], convert_charrefs=False)
269+
self._run_check('&#', [('data', '&#')], convert_charrefs=True)
270+
self._run_check('&#x', [('data', '&#x')], convert_charrefs=False)
271+
self._run_check('&#x', [('data', '&#x')], convert_charrefs=True)
194272

195273
def test_bad_nesting(self):
196274
# Strangely, this *is* supposed to test that overlapping
@@ -762,20 +840,6 @@ def test_correct_detection_of_start_tags(self):
762840
]
763841
self._run_check(html, expected)
764842

765-
def test_EOF_in_charref(self):
766-
# see #17802
767-
# This test checks that the UnboundLocalError reported in the issue
768-
# is not raised, however I'm not sure the returned values are correct.
769-
# Maybe HTMLParser should use self.unescape for these
770-
data = [
771-
('a&', [('data', 'a&')]),
772-
('a&b', [('data', 'ab')]),
773-
('a&b ', [('data', 'a'), ('entityref', 'b'), ('data', ' ')]),
774-
('a&b;', [('data', 'a'), ('entityref', 'b')]),
775-
]
776-
for html, expected in data:
777-
self._run_check(html, expected)
778-
779843
def test_eof_in_comments(self):
780844
data = [
781845
('<!--', [('comment', '')]),
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix handling of unclosed character references (named and numerical)
2+
followed by the end of file in :class:`html.parser.HTMLParser` with
3+
``convert_charrefs=False``.

0 commit comments

Comments
 (0)