Skip to content

Commit 2824921

Browse files
Merge pull request #9505 from jonathanhefner/relax-fortunes-html-validation
Relax HTML validation for Fortunes test
2 parents d77f904 + 7f84497 commit 2824921

File tree

1 file changed

+31
-16
lines changed

1 file changed

+31
-16
lines changed

toolset/test_types/fortune/fortune_html_parser.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,17 @@
88

99

1010
class FortuneHTMLParser(HTMLParser):
11+
IGNORED_TAGS = (
12+
"<meta>", "</meta>",
13+
"<link>", "</link>",
14+
"<script>", "</script>",
15+
"<thead>", "</thead>",
16+
"<tbody>", "</tbody>",
17+
)
18+
1119
def __init__(self):
1220
HTMLParser.__init__(self, convert_charrefs=False)
21+
self.ignore_content = False
1322
self.body = []
1423

1524
valid_fortune = '''<!doctype html><html>
@@ -41,7 +50,7 @@ def handle_decl(self, decl):
4150
# and since we did not specify xml compliance (where
4251
# incorrect casing would throw a syntax error), we must
4352
# allow all casings. We will lower for our normalization.
44-
self.body.append("<!{d}>".format(d=decl.lower()))
53+
self.append("<!{d}>".format(d=decl.lower()))
4554

4655
def handle_charref(self, name):
4756
'''
@@ -63,58 +72,58 @@ def handle_charref(self, name):
6372
# equality.
6473
if val == "34" or val == "034" or val == "x22":
6574
# Append our normalized entity reference to our body.
66-
self.body.append("&quot;")
75+
self.append("&quot;")
6776
# "&#39;" is a valid escaping of "'", but it is not
6877
# required, so we normalize for equality checking.
6978
if val == "39" or val == "039" or val == "x27":
70-
self.body.append("&apos;")
79+
self.append("&apos;")
7180
# Again, "&#43;" is a valid escaping of the "+", but
7281
# it is not required, so we need to normalize for our
7382
# final parse and equality check.
7483
if val == "43" or val == "043" or val == "x2b":
75-
self.body.append("+")
84+
self.append("+")
7685
# Again, "&#62;" is a valid escaping of ">", but we
7786
# need to normalize to "&gt;" for equality checking.
7887
if val == "62" or val == "062" or val == "x3e":
79-
self.body.append("&gt;")
88+
self.append("&gt;")
8089
# Again, "&#60;" is a valid escaping of "<", but we
8190
# need to normalize to "&lt;" for equality checking.
8291
if val == "60" or val == "060" or val == "x3c":
83-
self.body.append("&lt;")
92+
self.append("&lt;")
8493
# Not sure why some are escaping '/'
8594
if val == "47" or val == "047" or val == "x2f":
86-
self.body.append("/")
95+
self.append("/")
8796
# "&#40;" is a valid escaping of "(", but
8897
# it is not required, so we need to normalize for our
8998
# final parse and equality check.
9099
if val == "40" or val == "040" or val == "x28":
91-
self.body.append("(")
100+
self.append("(")
92101
# "&#41;" is a valid escaping of ")", but
93102
# it is not required, so we need to normalize for our
94103
# final parse and equality check.
95104
if val == "41" or val == "041" or val == "x29":
96-
self.body.append(")")
105+
self.append(")")
97106

98107
def handle_entityref(self, name):
99108
'''
100109
Again, "&mdash;" is a valid escaping of "—", but we
101110
need to normalize to "—" for equality checking.
102111
'''
103112
if name == "mdash":
104-
self.body.append("—")
113+
self.append("—")
105114
else:
106-
self.body.append("&{n};".format(n=name))
115+
self.append("&{n};".format(n=name))
107116

108117
def handle_starttag(self, tag, attrs):
109118
'''
110119
This is called every time a tag is opened. We append
111120
each one wrapped in "<" and ">".
112121
'''
113-
self.body.append("<{t}>".format(t=tag))
122+
self.append("<{t}>".format(t=tag))
114123

115124
# Append a newline after the <table> and <html>
116125
if tag.lower() == 'table' or tag.lower() == 'html':
117-
self.body.append(os.linesep)
126+
self.append(os.linesep)
118127

119128
def handle_data(self, data):
120129
'''
@@ -146,18 +155,24 @@ def handle_data(self, data):
146155
data = data.replace('"', '&quot;')
147156
data = data.replace('>', '&gt;')
148157

149-
self.body.append("{d}".format(d=data))
158+
self.append("{d}".format(d=data))
150159

151160
def handle_endtag(self, tag):
152161
'''
153162
This is called every time a tag is closed. We append
154163
each one wrapped in "</" and ">".
155164
'''
156-
self.body.append("</{t}>".format(t=tag))
165+
self.append("</{t}>".format(t=tag))
157166

158167
# Append a newline after each </tr> and </head>
159168
if tag.lower() == 'tr' or tag.lower() == 'head':
160-
self.body.append(os.linesep)
169+
self.append(os.linesep)
170+
171+
def append(self, item):
172+
self.ignore_content = item == "<script>" or (self.ignore_content and item != "</script>")
173+
174+
if not (self.ignore_content or item in self.IGNORED_TAGS):
175+
self.body.append(item)
161176

162177
def isValidFortune(self, name, out):
163178
'''

0 commit comments

Comments
 (0)