Skip to content
25 changes: 23 additions & 2 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
escapable_raw_text_close = re.compile('</(title|textarea)>', re.I)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it even used?

commentclose = re.compile(r'--\s*>')
# Note:
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
Expand Down Expand Up @@ -100,6 +101,7 @@ class HTMLParser(_markupbase.ParserBase):
"""

CDATA_CONTENT_ELEMENTS = ("script", "style")
ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea")

def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
Expand All @@ -117,6 +119,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self.escapable_raw_text_elem = None
super().reset()

def feed(self, data):
Expand All @@ -138,6 +141,14 @@ def get_starttag_text(self):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text

def set_escapable_raw_text_mode(self, elem):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the behavior for raw text elements and escapable raw text elements is so similar, and they cannot be nested, why not use set_cdata_mode() and cdata_elem for both? Just add an optional boolean parameter to specify whether it is escapable (charrefs should be unescaped) or not.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@serhiy-storchaka I can do that.

self.escapable_raw_text_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.escapable_raw_text_elem, re.I)

def clear_escapable_raw_text_mode(self):
    """Leave escapable-raw-text mode.

    Forgets the tracked element by resetting ``self.escapable_raw_text_elem``
    to None and puts the module-level ``interesting_normal`` pattern back on
    ``self.interesting``.
    """
    self.escapable_raw_text_elem = None
    self.interesting = interesting_normal

def set_cdata_mode(self, elem):
    """Enter raw-text (CDATA) mode for the element named *elem*.

    Stores the lower-cased element name in ``self.cdata_elem`` and replaces
    ``self.interesting`` with a case-insensitive pattern that matches only
    that element's closing tag (whitespace is tolerated after ``</`` and
    before ``>``).
    """
    self.cdata_elem = elem.lower()
    # Escape the name so that regex metacharacters in it (for example the
    # '.' in a custom element name) are matched literally, not as wildcards.
    closing_tag = r'</\s*%s\s*>' % re.escape(self.cdata_elem)
    self.interesting = re.compile(closing_tag, re.I)
Expand All @@ -154,7 +165,7 @@ def goahead(self, end):
i = 0
n = len(rawdata)
while i < n:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
j = rawdata.find('<', i)
if j < 0:
# if we can't find the next <, either we are at the end
Expand All @@ -173,11 +184,13 @@ def goahead(self, end):
if match:
j = match.start()
else:
if self.escapable_raw_text_elem:
break
if self.cdata_elem:
break
j = n
if i < j:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is incorrect. Charrefs should be resolved in an escapable raw text element, except for an ambiguous ampersand.

We also need tests for convert_charrefs=False in an escapable raw text element.

self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
Expand Down Expand Up @@ -354,6 +367,8 @@ def parse_starttag(self, i):
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS:
self.set_escapable_raw_text_mode(tag)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
Expand Down Expand Up @@ -429,8 +444,14 @@ def parse_endtag(self, i):
self.handle_data(rawdata[i:gtpos])
return gtpos

if self.escapable_raw_text_elem is not None: # title or textarea
if elem != self.escapable_raw_text_elem:
self.handle_data(rawdata[i:gtpos])
return gtpos

self.handle_endtag(elem)
self.clear_cdata_mode()
self.clear_escapable_raw_text_mode()
return gtpos

# Overridable -- finish processing of start+end tag: <tag.../>
Expand Down
42 changes: 39 additions & 3 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,9 @@ def test_cdata_content(self):
#'foo = </\nscript>',
#'foo = </ script>',
]
elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
tags = ['script', 'style', 'textarea', 'title']
# test the following 'casing' for each tag: script, SCRIPT, Script etc.
elements = [f(tag) for tag in tags for f in (str.lower, str.upper, str.capitalize)]
for content in contents:
for element in elements:
element_lower = element.lower()
Expand Down Expand Up @@ -317,6 +319,34 @@ def get_events(self):
("endtag", element_lower)],
collector=Collector(convert_charrefs=False))

def test_escapable_raw_text_content(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this test differ from test_cdata_content? BTW, most examples use JavaScript syntax and are only relevant for <script>.

contents = [
'foo = "</TITLE" + ">";',
'foo = <\n/title> ',
'<!-- document.write("</scr" + "ipt>"); -->',
'\n//<![CDATA[\n'
'\n<!-- //\nvar foo = 3.14;\n// -->\n',
# valid character reference
'&#65;',
# ambiguous ampersand example
'&notaref',
'foo = "</sty" + "le>";',
'<!-- \u2603 -->',
# these two should be invalid according to the HTML 5 spec,
# section 8.1.2.2
#'foo = </\nscript>',
#'foo = </ script>',
]
elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea']
for content in contents:
for element in elements:
element_lower = element.lower()
s = '<{element}>{content}</{element}>'.format(element=element,
content=content)
self._run_check(s, [("starttag", element_lower, []),
("data", content),
("endtag", element_lower)])

def test_EOF_in_cdata(self):
content = """<!-- not a comment --> &not-an-entity-ref;
<a href="" /> </p><p> <span></span></style>
Expand Down Expand Up @@ -377,9 +407,15 @@ def test_convert_charrefs(self):
('starttag', 'script', []), ('data', text),
('endtag', 'script'), ('data', '"'),
('starttag', 'style', []), ('data', text),
('endtag', 'style'), ('data', '"')]
('endtag', 'style'), ('data', '"'),
('starttag', 'title', []), ('data', text),
('endtag', 'title'), ('data', '"'),
('starttag', 'textarea', []), ('data', text),
('endtag', 'textarea'), ('data', '"')]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not correct. Charrefs should be resolved in escapable raw text elements, except for an ambiguous ampersand. Data should be '"X"X"' instead of text.

self._run_check('{1}<script>{0}</script>{1}'
'<style>{0}</style>{1}'.format(text, charref),
'<style>{0}</style>{1}'
'<title>{0}</title>{1}'
'<textarea>{0}</textarea>{1}'.format(text, charref),
expected, collector=collector())
# check truncated charrefs at the end of the file
html = '&quo &# &#x'
Expand Down
Loading