-
-
Notifications
You must be signed in to change notification settings - Fork 33.2k
gh-118350: Add escapable-raw-text mode to html parser #121770
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
420af54
1241a65
e7f11a0
bd63490
da868db
d17b409
d8cc255
a36070a
43804bb
70b8e5d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,7 @@ | |
|
||
starttagopen = re.compile('<[a-zA-Z]') | ||
piclose = re.compile('>') | ||
escapable_raw_text_close = re.compile('</(title|textarea)>', re.I) | ||
commentclose = re.compile(r'--\s*>') | ||
# Note: | ||
# 1) if you change tagfind/attrfind remember to update locatestarttagend too; | ||
|
@@ -82,6 +83,7 @@ class HTMLParser(_markupbase.ParserBase): | |
""" | ||
|
||
CDATA_CONTENT_ELEMENTS = ("script", "style") | ||
ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea") | ||
|
||
def __init__(self, *, convert_charrefs=True): | ||
"""Initialize and reset this instance. | ||
|
@@ -99,6 +101,7 @@ def reset(self): | |
self.lasttag = '???' | ||
self.interesting = interesting_normal | ||
self.cdata_elem = None | ||
self.escapable_raw_text_elem = None | ||
super().reset() | ||
|
||
def feed(self, data): | ||
|
@@ -120,6 +123,14 @@ def get_starttag_text(self): | |
"""Return full source of start tag: '<...>'.""" | ||
return self.__starttag_text | ||
|
||
def set_escapable_raw_text_mode(self, elem): | ||
|
||
self.escapable_raw_text_elem = elem.lower() | ||
self.interesting = re.compile(r'</\s*%s\s*>' % self.escapable_raw_text_elem, re.I) | ||
|
||
def clear_escapable_raw_text_mode(self): | ||
self.interesting = interesting_normal | ||
self.escapable_raw_text_elem = None | ||
|
||
def set_cdata_mode(self, elem): | ||
self.cdata_elem = elem.lower() | ||
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) | ||
|
@@ -136,7 +147,7 @@ def goahead(self, end): | |
i = 0 | ||
n = len(rawdata) | ||
while i < n: | ||
if self.convert_charrefs and not self.cdata_elem: | ||
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem: | ||
j = rawdata.find('<', i) | ||
if j < 0: | ||
# if we can't find the next <, either we are at the end | ||
|
@@ -155,11 +166,13 @@ def goahead(self, end): | |
if match: | ||
j = match.start() | ||
else: | ||
if self.escapable_raw_text_elem: | ||
break | ||
if self.cdata_elem: | ||
break | ||
j = n | ||
if i < j: | ||
if self.convert_charrefs and not self.cdata_elem: | ||
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem: | ||
|
||
self.handle_data(unescape(rawdata[i:j])) | ||
else: | ||
self.handle_data(rawdata[i:j]) | ||
|
@@ -336,6 +349,8 @@ def parse_starttag(self, i): | |
self.handle_startendtag(tag, attrs) | ||
else: | ||
self.handle_starttag(tag, attrs) | ||
if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS: | ||
self.set_escapable_raw_text_mode(tag) | ||
if tag in self.CDATA_CONTENT_ELEMENTS: | ||
self.set_cdata_mode(tag) | ||
return endpos | ||
|
@@ -411,8 +426,14 @@ def parse_endtag(self, i): | |
self.handle_data(rawdata[i:gtpos]) | ||
return gtpos | ||
|
||
if self.escapable_raw_text_elem is not None: # title or textarea | ||
if elem != self.escapable_raw_text_elem: | ||
self.handle_data(rawdata[i:gtpos]) | ||
return gtpos | ||
|
||
self.handle_endtag(elem) | ||
self.clear_cdata_mode() | ||
self.clear_escapable_raw_text_mode() | ||
return gtpos | ||
|
||
# Overridable -- finish processing of start+end tag: <tag.../> | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -285,7 +285,7 @@ def test_cdata_content(self): | |
#'foo = </\nscript>', | ||
#'foo = </ script>', | ||
] | ||
elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style'] | ||
elements = ['script', 'style', 'SCRIPT', 'TEXTAREA', 'Script', 'Textarea'] | ||
for content in contents: | ||
for element in elements: | ||
element_lower = element.lower() | ||
|
@@ -317,6 +317,58 @@ def get_events(self): | |
("endtag", element_lower)], | ||
collector=Collector(convert_charrefs=False)) | ||
|
||
def test_escapable_raw_text_content(self): | ||
|
||
contents = [ | ||
'<h2>This is a header</h2>', | ||
'Rebelious<h1>Heading' | ||
'<!-- not a comment --> ¬-an-entity-ref;', | ||
"<not a='start tag'>", | ||
'<a href="" /> <p> <span></span>', | ||
'foo = "</scr" + "ipt>";', | ||
|
||
'foo = "</TITLE" + ">";', | ||
'foo = <\n/title> ', | ||
'<!-- document.write("</scr" + "ipt>"); -->', | ||
'\n//<![CDATA[\n' | ||
'\n<!-- //\nvar foo = 3.14;\n// -->\n', | ||
'foo = "</sty" + "le>";', | ||
'<!-- \u2603 -->', | ||
# these two should be invalid according to the HTML 5 spec, | ||
# section 8.1.2.2 | ||
#'foo = </\nscript>', | ||
#'foo = </ script>', | ||
] | ||
elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea'] | ||
for content in contents: | ||
for element in elements: | ||
element_lower = element.lower() | ||
s = '<{element}>{content}</{element}>'.format(element=element, | ||
content=content) | ||
self._run_check(s, [("starttag", element_lower, []), | ||
("data", content), | ||
("endtag", element_lower)]) | ||
|
||
def test_escapable_raw_text_with_closing_tags(self): | ||
|
||
# see issue #13358 | ||
# make sure that HTMLParser calls handle_data only once for each CDATA. | ||
# The normal event collector normalizes the events in get_events, | ||
# so we override it to return the original list of events. | ||
class Collector(EventCollector): | ||
def get_events(self): | ||
return self.events | ||
|
||
content = """<!-- not a comment --> ¬-an-entity-ref; | ||
<a href="" /> </p><p> <span></span></style> | ||
'</script' + '>'""" | ||
for element in [' script', 'script ', ' script ', | ||
'\nscript', 'script\n', '\nscript\n']: | ||
element_lower = element.lower().strip() | ||
s = '<script>{content}</{element}>'.format(element=element, | ||
content=content) | ||
self._run_check(s, [("starttag", element_lower, []), | ||
("data", content), | ||
("endtag", element_lower)], | ||
collector=Collector(convert_charrefs=False)) | ||
|
||
def test_comments(self): | ||
html = ("<!-- I'm a valid comment -->" | ||
'<!--me too!-->' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it even used?