Skip to content

Commit 9d4679a

Browse files
committed
fix: improve charset detection in HTML parsing
1 parent 3710028 commit 9d4679a

File tree

1 file changed

+16
-4
lines changed

1 file changed

+16
-4
lines changed

apps/common/util/fork.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,18 +137,30 @@ def get_beautiful_soup(response):
137137
html_content = response.content.decode(encoding)
138138
beautiful_soup = BeautifulSoup(html_content, "html.parser")
139139
meta_list = beautiful_soup.find_all('meta')
140-
charset_list = [meta.attrs.get('charset') for meta in meta_list if
141-
meta.attrs is not None and 'charset' in meta.attrs]
140+
charset_list = Fork.get_charset_list(meta_list)
142141
if len(charset_list) > 0:
143142
charset = charset_list[0]
144143
if charset != encoding:
145144
try:
146-
html_content = response.content.decode(charset)
145+
html_content = response.content.decode(charset, errors='replace')
147146
except Exception as e:
148-
logging.getLogger("max_kb").error(f'{e}')
147+
logging.getLogger("max_kb").error(f'{e}: {traceback.format_exc()}')
149148
return BeautifulSoup(html_content, "html.parser")
150149
return beautiful_soup
151150

151+
@staticmethod
152+
def get_charset_list(meta_list):
    """Collect the character encodings declared by a sequence of ``<meta>`` tags.

    Two declaration forms are recognised for each tag, in this order:

    * ``<meta charset="...">``
    * ``<meta http-equiv="Content-Type" content="text/html; charset=...">``

    Args:
        meta_list: Iterable of tag-like objects exposing an ``attrs``
            attribute that is either a mapping of attribute names to values
            or ``None``.

    Returns:
        list[str]: The declared charset names in the order the tags were
        given. Blank declarations are skipped, and surrounding whitespace or
        quotes are removed, so that the first entry (the one the caller
        decodes with) is a usable codec name whenever any tag declares one.
    """
    # Tolerate optional whitespace around "=" and an optional opening quote;
    # the captured name stops at whitespace, ";" or a closing quote.
    charset_pattern = re.compile(r'''charset\s*=\s*["']?([^\s;"']+)''', re.I)

    charset_list = []
    for meta in meta_list:
        attrs = meta.attrs
        if not attrs:
            continue

        charset = attrs.get('charset')
        if not isinstance(charset, str) or not charset.strip():
            # No usable ``charset`` attribute: fall back to the legacy
            # ``http-equiv="Content-Type"`` form.
            charset = None
            http_equiv = attrs.get('http-equiv')
            content = attrs.get('content')
            if (isinstance(http_equiv, str) and isinstance(content, str)
                    and http_equiv.strip().lower() == 'content-type'):
                match = charset_pattern.search(content)
                if match:
                    charset = match.group(1)

        if charset:
            charset_list.append(charset.strip())
    return charset_list
163+
152164
def fork(self):
153165
try:
154166

0 commit comments

Comments
 (0)