-
-
Notifications
You must be signed in to change notification settings - Fork 33.2k
gh-118350: Add escapable-raw-text mode to html parser #121770
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
420af54
1241a65
e7f11a0
bd63490
da868db
d17b409
d8cc255
a36070a
43804bb
70b8e5d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ | |
|
||
starttagopen = re.compile('<[a-zA-Z]') | ||
piclose = re.compile('>') | ||
escapable_raw_text_close = re.compile('</(title|textarea)>', re.I) | ||
commentclose = re.compile(r'--\s*>') | ||
# Note: | ||
# 1) if you change tagfind/attrfind remember to update locatestarttagend too; | ||
|
@@ -100,6 +101,7 @@ class HTMLParser(_markupbase.ParserBase): | |
""" | ||
|
||
CDATA_CONTENT_ELEMENTS = ("script", "style") | ||
ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea") | ||
|
||
def __init__(self, *, convert_charrefs=True): | ||
"""Initialize and reset this instance. | ||
|
@@ -117,6 +119,7 @@ def reset(self): | |
self.lasttag = '???' | ||
self.interesting = interesting_normal | ||
self.cdata_elem = None | ||
self.escapable_raw_text_elem = None | ||
super().reset() | ||
|
||
def feed(self, data): | ||
|
@@ -138,6 +141,14 @@ def get_starttag_text(self): | |
"""Return full source of start tag: '<...>'.""" | ||
return self.__starttag_text | ||
|
||
def set_escapable_raw_text_mode(self, elem): | ||
|
||
self.escapable_raw_text_elem = elem.lower() | ||
self.interesting = re.compile(r'</\s*%s\s*>' % self.escapable_raw_text_elem, re.I) | ||
|
||
def clear_escapable_raw_text_mode(self): | ||
self.interesting = interesting_normal | ||
self.escapable_raw_text_elem = None | ||
|
||
def set_cdata_mode(self, elem): | ||
self.cdata_elem = elem.lower() | ||
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) | ||
|
@@ -154,7 +165,7 @@ def goahead(self, end): | |
i = 0 | ||
n = len(rawdata) | ||
while i < n: | ||
if self.convert_charrefs and not self.cdata_elem: | ||
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem: | ||
j = rawdata.find('<', i) | ||
if j < 0: | ||
# if we can't find the next <, either we are at the end | ||
|
@@ -173,11 +184,13 @@ def goahead(self, end): | |
if match: | ||
j = match.start() | ||
else: | ||
if self.escapable_raw_text_elem: | ||
break | ||
if self.cdata_elem: | ||
break | ||
j = n | ||
if i < j: | ||
if self.convert_charrefs and not self.cdata_elem: | ||
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem: | ||
|
||
self.handle_data(unescape(rawdata[i:j])) | ||
else: | ||
self.handle_data(rawdata[i:j]) | ||
|
@@ -354,6 +367,8 @@ def parse_starttag(self, i): | |
self.handle_startendtag(tag, attrs) | ||
else: | ||
self.handle_starttag(tag, attrs) | ||
if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS: | ||
self.set_escapable_raw_text_mode(tag) | ||
if tag in self.CDATA_CONTENT_ELEMENTS: | ||
self.set_cdata_mode(tag) | ||
return endpos | ||
|
@@ -429,8 +444,14 @@ def parse_endtag(self, i): | |
self.handle_data(rawdata[i:gtpos]) | ||
return gtpos | ||
|
||
if self.escapable_raw_text_elem is not None: # title or textarea | ||
if elem != self.escapable_raw_text_elem: | ||
self.handle_data(rawdata[i:gtpos]) | ||
return gtpos | ||
|
||
self.handle_endtag(elem) | ||
self.clear_cdata_mode() | ||
self.clear_escapable_raw_text_mode() | ||
return gtpos | ||
|
||
# Overridable -- finish processing of start+end tag: <tag.../> | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -285,7 +285,9 @@ def test_cdata_content(self): | |
#'foo = </\nscript>', | ||
#'foo = </ script>', | ||
] | ||
elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style'] | ||
tags = ['script', 'style', 'textarea', 'title'] | ||
# test the following 'casing' for each tag: script, SCRIPT, Script etc. | ||
elements = [f(tag) for tag in tags for f in (str.lower, str.upper, str.capitalize)] | ||
for content in contents: | ||
for element in elements: | ||
element_lower = element.lower() | ||
|
@@ -317,6 +319,34 @@ def get_events(self): | |
("endtag", element_lower)], | ||
collector=Collector(convert_charrefs=False)) | ||
|
||
def test_escapable_raw_text_content(self): | ||
|
||
contents = [ | ||
'foo = "</TITLE" + ">";', | ||
'foo = <\n/title> ', | ||
'<!-- document.write("</scr" + "ipt>"); -->', | ||
'\n//<![CDATA[\n' | ||
'\n<!-- //\nvar foo = 3.14;\n// -->\n', | ||
# valid character reference | ||
'&#65;', | ||
# ambiguous ampersand example | ||
'&notaref', | ||
'foo = "</sty" + "le>";', | ||
'<!-- \u2603 -->', | ||
# these two should be invalid according to the HTML 5 spec, | ||
# section 8.1.2.2 | ||
#'foo = </\nscript>', | ||
#'foo = </ script>', | ||
] | ||
elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea'] | ||
for content in contents: | ||
for element in elements: | ||
element_lower = element.lower() | ||
s = '<{element}>{content}</{element}>'.format(element=element, | ||
content=content) | ||
self._run_check(s, [("starttag", element_lower, []), | ||
("data", content), | ||
("endtag", element_lower)]) | ||
|
||
def test_EOF_in_cdata(self): | ||
content = """<!-- not a comment --> &not-an-entity-ref; | ||
<a href="" /> </p><p> <span></span></style> | ||
|
@@ -377,9 +407,15 @@ def test_convert_charrefs(self): | |
('starttag', 'script', []), ('data', text), | ||
('endtag', 'script'), ('data', '"'), | ||
('starttag', 'style', []), ('data', text), | ||
('endtag', 'style'), ('data', '"')] | ||
('endtag', 'style'), ('data', '"'), | ||
('starttag', 'title', []), ('data', text), | ||
('endtag', 'title'), ('data', '"'), | ||
('starttag', 'textarea', []), ('data', text), | ||
('endtag', 'textarea'), ('data', '"')] | ||
|
||
self._run_check('{1}<script>{0}</script>{1}' | ||
'<style>{0}</style>{1}'.format(text, charref), | ||
'<style>{0}</style>{1}' | ||
'<title>{0}</title>{1}' | ||
'<textarea>{0}</textarea>{1}'.format(text, charref), | ||
expected, collector=collector()) | ||
# check truncated charrefs at the end of the file | ||
html = '&quo &# &#x' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
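Is it even used? (The anchor of this comment was lost in extraction; it presumably refers to the new module-level pattern `escapable_raw_text_close`, which nothing in the visible diff references.)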