8
8
9
9
10
10
class FortuneHTMLParser(HTMLParser):
    '''
    HTML parser that records a normalized stream of markup tokens in
    self.body, one string per tag / entity / data chunk.
    '''

    # Serialized tag tokens that are never stored in the body. The
    # "<script>" / "</script>" pair additionally toggles
    # self.ignore_content inside append().
    IGNORED_TAGS = (
        "<meta>", "</meta>",
        "<link>", "</link>",
        "<script>", "</script>",
        "<thead>", "</thead>",
        "<tbody>", "</tbody>",
    )

    def __init__(self):
        '''
        Set up an empty token list. Character references are NOT
        converted automatically (convert_charrefs=False), so they reach
        handle_charref / handle_entityref as separate events.
        '''
        super().__init__(convert_charrefs=False)
        # True while tokens are being dropped because we are between a
        # "<script>" token and its matching "</script>" token.
        self.ignore_content = False
        # Accumulated normalized tokens, in document order.
        self.body = []
14
23
15
24
valid_fortune = '''<!doctype html><html>
@@ -41,7 +50,7 @@ def handle_decl(self, decl):
41
50
# and since we did not specify xml compliance (where
42
51
# incorrect casing would throw a syntax error), we must
43
52
# allow all casings. We will lower for our normalization.
44
- self .body . append ("<!{d}>" .format (d = decl .lower ()))
53
+ self .append ("<!{d}>" .format (d = decl .lower ()))
45
54
46
55
def handle_charref (self , name ):
47
56
'''
@@ -63,58 +72,58 @@ def handle_charref(self, name):
63
72
# equality.
64
73
if val == "34" or val == "034" or val == "x22" :
65
74
# Append our normalized entity reference to our body.
66
- self .body . append (""" )
75
+ self .append (""" )
67
76
# "&#39;" is a valid escaping of "'", but it is not
68
77
# required, so we normalize for equality checking.
69
78
if val == "39" or val == "039" or val == "x27" :
70
- self .body . append ("'" )
79
+ self .append ("'" )
71
80
# Again, "&#43;" is a valid escaping of the "+", but
72
81
# it is not required, so we need to normalize for our
73
82
# final parse and equality check.
74
83
if val == "43" or val == "043" or val == "x2b" :
75
- self .body . append ("+" )
84
+ self .append ("+" )
76
85
# Again, "&#62;" is a valid escaping of ">", but we
77
86
# need to normalize to ">" for equality checking.
78
87
if val == "62" or val == "062" or val == "x3e" :
79
- self .body . append (">" )
88
+ self .append (">" )
80
89
# Again, "&#60;" is a valid escaping of "<", but we
81
90
# need to normalize to "<" for equality checking.
82
91
if val == "60" or val == "060" or val == "x3c" :
83
- self .body . append ("<" )
92
+ self .append ("<" )
84
93
# Not sure why some are escaping '/'
85
94
if val == "47" or val == "047" or val == "x2f" :
86
- self .body . append ("/" )
95
+ self .append ("/" )
87
96
# "&#40;" is a valid escaping of "(", but
88
97
# it is not required, so we need to normalize for our
89
98
# final parse and equality check.
90
99
if val == "40" or val == "040" or val == "x28" :
91
- self .body . append ("(" )
100
+ self .append ("(" )
92
101
# "&#41;" is a valid escaping of ")", but
93
102
# it is not required, so we need to normalize for our
94
103
# final parse and equality check.
95
104
if val == "41" or val == "041" or val == "x29" :
96
- self .body . append (")" )
105
+ self .append (")" )
97
106
98
107
def handle_entityref(self, name):
    '''
    Called for every named entity reference, with the name stripped
    of its "&" and ";" (e.g. "mdash" for "&mdash;").

    "&mdash;" is a valid escaping of "—", so it is normalized to the
    literal "—" for equality checking. Every other named reference
    is re-emitted unchanged as "&<name>;".
    '''
    if name == "mdash":
        self.append("—")
    else:
        self.append("&{n};".format(n=name))
107
116
108
117
def handle_starttag(self, tag, attrs):
    '''
    This is called every time a tag is opened. We append
    each one wrapped in "<" and ">". The attributes are
    deliberately dropped: only the tag name is recorded.
    '''
    self.append("<{t}>".format(t=tag))

    # Append a newline after the <table> and <html>
    if tag.lower() in ("table", "html"):
        self.append(os.linesep)
118
127
119
128
def handle_data (self , data ):
120
129
'''
@@ -146,18 +155,24 @@ def handle_data(self, data):
146
155
data = data .replace ('"' , '"' )
147
156
data = data .replace ('>' , '>' )
148
157
149
- self .body . append ("{d}" .format (d = data ))
158
+ self .append ("{d}" .format (d = data ))
150
159
151
160
def handle_endtag(self, tag):
    '''
    This is called every time a tag is closed. We append
    each one wrapped in "</" and ">".
    '''
    self.append("</{t}>".format(t=tag))

    # Append a newline after each </tr> and </head>
    if tag.lower() in ("tr", "head"):
        self.append(os.linesep)
170
+
171
def append(self, item):
    '''
    Record one normalized token in self.body unless it must be
    filtered out.

    A "<script>" token switches content-ignoring on and a
    "</script>" token switches it off again; any other token leaves
    the flag as it was. While the flag is set nothing is stored, and
    tokens listed in IGNORED_TAGS are never stored.
    '''
    if item == "<script>":
        self.ignore_content = True
    elif item == "</script>":
        self.ignore_content = False

    if not (self.ignore_content or item in self.IGNORED_TAGS):
        self.body.append(item)
161
176
162
177
def isValidFortune (self , name , out ):
163
178
'''
0 commit comments