|
1 | 1 | import copy |
2 | | -import logging |
3 | 2 | import re |
4 | 3 | import traceback |
5 | 4 | from functools import reduce |
@@ -139,18 +138,30 @@ def get_beautiful_soup(response): |
139 | 138 | html_content = response.content.decode(encoding) |
140 | 139 | beautiful_soup = BeautifulSoup(html_content, "html.parser") |
141 | 140 | meta_list = beautiful_soup.find_all('meta') |
142 | | - charset_list = [meta.attrs.get('charset') for meta in meta_list if |
143 | | - meta.attrs is not None and 'charset' in meta.attrs] |
| 141 | + charset_list = Fork.get_charset_list(meta_list) |
144 | 142 | if len(charset_list) > 0: |
145 | 143 | charset = charset_list[0] |
146 | 144 | if charset != encoding: |
147 | 145 | try: |
148 | | - html_content = response.content.decode(charset) |
| 146 | + html_content = response.content.decode(charset, errors='replace') |
149 | 147 | except Exception as e: |
150 | | - maxkb_logger.error(f'{e}') |
| 148 | + maxkb_logger.error(f'{e}: {traceback.format_exc()}') |
151 | 149 | return BeautifulSoup(html_content, "html.parser") |
152 | 150 | return beautiful_soup |
153 | 151 |
|
@staticmethod
def get_charset_list(meta_list):
    """Collect the character encodings declared by a page's ``<meta>`` tags.

    Two declaration forms are recognised, and results keep document order:

    * ``<meta charset="...">``
    * ``<meta http-equiv="Content-Type" content="...; charset=...">``

    Args:
        meta_list: Iterable of tag-like objects exposing an ``attrs``
            mapping (``attrs`` may also be ``None``).

    Returns:
        list[str]: Declared charset names with surrounding whitespace and
        quotes removed. Empty declarations are skipped so callers never
        try to decode with a blank codec name.
    """
    charset_list = []
    for meta in meta_list:
        attrs = meta.attrs
        if attrs is None:
            continue
        if 'charset' in attrs:
            charset = attrs.get('charset')
        elif (attrs.get('http-equiv') or '').lower() == 'content-type' and 'content' in attrs:
            # Allow optional whitespace around '=' and an optional opening
            # quote; stop at whitespace, ';' or a closing quote so the
            # captured name is directly usable as a codec name.
            match = re.search(r'charset\s*=\s*["\']?([^\s;"\']+)', attrs['content'] or '', re.I)
            charset = match.group(1) if match else None
        else:
            continue
        # Normalise: drop padding and stray quotes, ignore blank values.
        charset = (charset or '').strip().strip('\'"').strip()
        if charset:
            charset_list.append(charset)
    return charset_list
| 164 | + |
154 | 165 | def fork(self): |
155 | 166 | try: |
156 | 167 |
|
|
0 commit comments