File tree Expand file tree Collapse file tree 1 file changed +16
-4
lines changed
Expand file tree Collapse file tree 1 file changed +16
-4
lines changed Original file line number Diff line number Diff line change @@ -137,18 +137,30 @@ def get_beautiful_soup(response):
137137 html_content = response .content .decode (encoding )
138138 beautiful_soup = BeautifulSoup (html_content , "html.parser" )
139139 meta_list = beautiful_soup .find_all ('meta' )
140- charset_list = [meta .attrs .get ('charset' ) for meta in meta_list if
141- meta .attrs is not None and 'charset' in meta .attrs ]
140+ charset_list = Fork .get_charset_list (meta_list )
142141 if len (charset_list ) > 0 :
143142 charset = charset_list [0 ]
144143 if charset != encoding :
145144 try :
146- html_content = response .content .decode (charset )
145+ html_content = response .content .decode (charset , errors = 'replace' )
147146 except Exception as e :
148- logging .getLogger ("max_kb" ).error (f'{ e } ' )
147+ logging .getLogger ("max_kb" ).error (f'{ e } : { traceback . format_exc () } ' )
149148 return BeautifulSoup (html_content , "html.parser" )
150149 return beautiful_soup
151150
151+ @staticmethod
152+ def get_charset_list (meta_list ):
153+ charset_list = []
154+ for meta in meta_list :
155+ if meta .attrs is not None :
156+ if 'charset' in meta .attrs :
157+ charset_list .append (meta .attrs .get ('charset' ))
158+ elif meta .attrs .get ('http-equiv' , '' ).lower () == 'content-type' and 'content' in meta .attrs :
159+ match = re .search (r'charset=([^\s;]+)' , meta .attrs ['content' ], re .I )
160+ if match :
161+ charset_list .append (match .group (1 ))
162+ return charset_list
163+
152164 def fork (self ):
153165 try :
154166
You can’t perform that action at this time.
0 commit comments