88
99
1010class FortuneHTMLParser (HTMLParser ):
# Tag tokens, in the "<tag>" / "</tag>" form produced by the tag
# handlers, that append() leaves out of the normalized body.
IGNORED_TAGS = (
    "<meta>", "</meta>",
    "<link>", "</link>",
    "<script>", "</script>",
    "<thead>", "</thead>",
    "<tbody>", "</tbody>",
)
18+
def __init__(self):
    '''
    Set up the underlying HTMLParser and the parser's own state.

    convert_charrefs is turned off so that character and entity
    references are delivered to handle_charref / handle_entityref
    instead of being folded into the text passed to handle_data.
    '''
    HTMLParser.__init__(self, convert_charrefs=False)
    # True while append() is discarding everything between a
    # "<script>" token and its closing "</script>" token.
    self.ignore_content = False
    # Normalized output tokens, in document order.
    self.body = []
1423
1524 valid_fortune = '''<!doctype html><html>
@@ -41,7 +50,7 @@ def handle_decl(self, decl):
4150 # and since we did not specify xml compliance (where
4251 # incorrect casing would throw a syntax error), we must
4352 # allow all casings. We will lower for our normalization.
44- self .body . append ("<!{d}>" .format (d = decl .lower ()))
53+ self .append ("<!{d}>" .format (d = decl .lower ()))
4554
4655 def handle_charref (self , name ):
4756 '''
@@ -63,58 +72,58 @@ def handle_charref(self, name):
6372 # equality.
6473 if val == "34" or val == "034" or val == "x22" :
6574 # Append our normalized entity reference to our body.
66- self .body . append (""" )
75+ self .append (""" )
6776 # "'" is a valid escaping of "-", but it is not
6877 # required, so we normalize for equality checking.
6978 if val == "39" or val == "039" or val == "x27" :
70- self .body . append ("'" )
79+ self .append ("'" )
7180 # Again, "+" is a valid escaping of the "+", but
7281 # it is not required, so we need to normalize for our
7382 # final parse and equality check.
7483 if val == "43" or val == "043" or val == "x2b" :
75- self .body . append ("+" )
84+ self .append ("+" )
7685 # Again, ">" is a valid escaping of ">", but we
7786 # need to normalize to ">" for equality checking.
7887 if val == "62" or val == "062" or val == "x3e" :
79- self .body . append (">" )
88+ self .append (">" )
8089 # Again, "<" is a valid escaping of "<", but we
8190 # need to normalize to "<" for equality checking.
8291 if val == "60" or val == "060" or val == "x3c" :
83- self .body . append ("<" )
92+ self .append ("<" )
8493 # Not sure why some are escaping '/'
8594 if val == "47" or val == "047" or val == "x2f" :
86- self .body . append ("/" )
95+ self .append ("/" )
8796 # "(" is a valid escaping of "(", but
8897 # it is not required, so we need to normalize for our
8998 # final parse and equality check.
9099 if val == "40" or val == "040" or val == "x28" :
91- self .body . append ("(" )
100+ self .append ("(" )
92101 # ")" is a valid escaping of ")", but
93102 # it is not required, so we need to normalize for our
94103 # final parse and equality check.
95104 if val == "41" or val == "041" or val == "x29" :
96- self .body . append (")" )
105+ self .append (")" )
97106
def handle_entityref(self, name):
    '''
    Called for each named entity reference of the form "&name;".

    The "mdash" entity is normalized to a literal em dash so that
    both spellings compare equal; every other named entity is
    passed through unchanged as "&name;".
    '''
    normalized = "—" if name == "mdash" else "&{n};".format(n=name)
    self.append(normalized)
107116
def handle_starttag(self, tag, attrs):
    '''
    Called every time a tag is opened. The tag name is appended
    wrapped in "<" and ">"; attrs is accepted but not emitted.
    '''
    self.append("<{t}>".format(t=tag))

    # Append a newline after the <table> and <html>
    if tag.lower() in ('table', 'html'):
        self.append(os.linesep)
118127
119128 def handle_data (self , data ):
120129 '''
@@ -146,18 +155,24 @@ def handle_data(self, data):
146155 data = data .replace ('"' , '"' )
147156 data = data .replace ('>' , '>' )
148157
149- self .body . append ("{d}" .format (d = data ))
158+ self .append ("{d}" .format (d = data ))
150159
def handle_endtag(self, tag):
    '''
    Called every time a tag is closed. The tag name is appended
    wrapped in "</" and ">".
    '''
    self.append("</{t}>".format(t=tag))

    # Append a newline after each </tr> and </head>
    if tag.lower() in ('tr', 'head'):
        self.append(os.linesep)
170+
def append(self, item):
    '''
    Add item to self.body unless it is being filtered out.

    A "<script>" token switches self.ignore_content on and a
    "</script>" token switches it off again; while it is on,
    nothing is stored. Independently of that flag, any item
    listed in self.IGNORED_TAGS is never stored.
    '''
    if item == "<script>":
        # Entering a script element: start discarding content.
        self.ignore_content = True
    elif item == "</script>":
        # Leaving the script element: stop discarding. The closing
        # tag itself is only dropped if IGNORED_TAGS lists it.
        self.ignore_content = False

    if self.ignore_content or item in self.IGNORED_TAGS:
        return
    self.body.append(item)
161176
162177 def isValidFortune (self , name , out ):
163178 '''
0 commit comments