Skip to content
25 changes: 23 additions & 2 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
escapable_raw_text_close = re.compile('</(title|textarea)>', re.I)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it even used?

commentclose = re.compile(r'--\s*>')
# Note:
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
Expand Down Expand Up @@ -82,6 +83,7 @@ class HTMLParser(_markupbase.ParserBase):
"""

CDATA_CONTENT_ELEMENTS = ("script", "style")
ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea")

def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
Expand All @@ -99,6 +101,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self.escapable_raw_text_elem = None
super().reset()

def feed(self, data):
Expand All @@ -120,6 +123,14 @@ def get_starttag_text(self):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text

def set_escapable_raw_text_mode(self, elem):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the behavior for raw text elements and escapable raw text elements is so similar, and they cannot be nested, why not use set_cdata_mode() and cdata_elem for both? Just add an optional boolean parameter to specify whether it is escapable (charrefs should be unescaped) or not.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@serhiy-storchaka I can do that.

self.escapable_raw_text_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.escapable_raw_text_elem, re.I)

def clear_escapable_raw_text_mode(self):
    """Leave escapable raw text mode.

    Forgets the element name stored in ``self.escapable_raw_text_elem`` and
    points ``self.interesting`` back at the module-level
    ``interesting_normal`` pattern.
    """
    # The two resets are independent of each other.
    self.escapable_raw_text_elem = None
    self.interesting = interesting_normal

def set_cdata_mode(self, elem):
    """Enter CDATA content mode for the element named *elem*.

    Stores the lowercased element name in ``self.cdata_elem`` and replaces
    ``self.interesting`` with a case-insensitive pattern that matches only
    that element's closing tag (whitespace is tolerated after ``</`` and
    before ``>``).
    """
    self.cdata_elem = elem.lower()
    # Escape the name so that any regex metacharacters in it are matched
    # literally rather than being interpreted as pattern syntax.
    self.interesting = re.compile(
        r'</\s*%s\s*>' % re.escape(self.cdata_elem), re.I)
Expand All @@ -136,7 +147,7 @@ def goahead(self, end):
i = 0
n = len(rawdata)
while i < n:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
j = rawdata.find('<', i)
if j < 0:
# if we can't find the next <, either we are at the end
Expand All @@ -155,11 +166,13 @@ def goahead(self, end):
if match:
j = match.start()
else:
if self.escapable_raw_text_elem:
break
if self.cdata_elem:
break
j = n
if i < j:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is incorrect. Character references should be resolved inside an escapable raw text element, except for an ambiguous ampersand.

We also need tests for convert_charrefs=False in an escapable raw text element.

self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
Expand Down Expand Up @@ -336,6 +349,8 @@ def parse_starttag(self, i):
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS:
self.set_escapable_raw_text_mode(tag)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
Expand Down Expand Up @@ -411,8 +426,14 @@ def parse_endtag(self, i):
self.handle_data(rawdata[i:gtpos])
return gtpos

if self.escapable_raw_text_elem is not None: # title or textarea
if elem != self.escapable_raw_text_elem:
self.handle_data(rawdata[i:gtpos])
return gtpos

self.handle_endtag(elem)
self.clear_cdata_mode()
self.clear_escapable_raw_text_mode()
return gtpos

# Overridable -- finish processing of start+end tag: <tag.../>
Expand Down
54 changes: 53 additions & 1 deletion Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def test_cdata_content(self):
#'foo = </\nscript>',
#'foo = </ script>',
]
elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
elements = ['script', 'style', 'SCRIPT', 'TEXTAREA', 'Script', 'Textarea']
for content in contents:
for element in elements:
element_lower = element.lower()
Expand Down Expand Up @@ -317,6 +317,58 @@ def get_events(self):
("endtag", element_lower)],
collector=Collector(convert_charrefs=False))

def test_escapable_raw_text_content(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this test differ from test_cdata_content? BTW, most examples use JavaScript syntax, and are only relevant for <script>.

contents = [
'<h2>This is a header</h2>',
'Rebelious<h1>Heading'
'<!-- not a comment --> &not-an-entity-ref;',
"<not a='start tag'>",
'<a href="" /> <p> <span></span>',
'foo = "</scr" + "ipt>";',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why test this in the title and textarea elements?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also add examples of valid character references and an ambiguous ampersand.

'foo = "</TITLE" + ">";',
'foo = <\n/title> ',
'<!-- document.write("</scr" + "ipt>"); -->',
'\n//<![CDATA[\n'
'\n<!-- //\nvar foo = 3.14;\n// -->\n',
'foo = "</sty" + "le>";',
'<!-- \u2603 -->',
# these two should be invalid according to the HTML 5 spec,
# section 8.1.2.2
#'foo = </\nscript>',
#'foo = </ script>',
]
elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea']
for content in contents:
for element in elements:
element_lower = element.lower()
s = '<{element}>{content}</{element}>'.format(element=element,
content=content)
self._run_check(s, [("starttag", element_lower, []),
("data", content),
("endtag", element_lower)])

def test_escapable_raw_text_with_closing_tags(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it right? The test name is test_escapable_raw_text_with_closing_tags, but it tests the script element. It looks very similar to test_cdata_with_closing_tags.

# see issue #13358
# make sure that HTMLParser calls handle_data only once for each CDATA.
# The normal event collector normalizes the events in get_events,
# so we override it to return the original list of events.
class Collector(EventCollector):
def get_events(self):
return self.events

content = """<!-- not a comment --> &not-an-entity-ref;
<a href="" /> </p><p> <span></span></style>
'</script' + '>'"""
for element in [' script', 'script ', ' script ',
'\nscript', 'script\n', '\nscript\n']:
element_lower = element.lower().strip()
s = '<script>{content}</{element}>'.format(element=element,
content=content)
self._run_check(s, [("starttag", element_lower, []),
("data", content),
("endtag", element_lower)],
collector=Collector(convert_charrefs=False))

def test_comments(self):
html = ("<!-- I'm a valid comment -->"
'<!--me too!-->'
Expand Down