Skip to content

Commit 6184058

Browse files
committed
feat: implement charset detection utility in fork.py
--bug=1059829 --user=刘瑞斌 【知识库】web导入知识库中文字符乱码 https://www.tapd.cn/62980211/s/1746162
1 parent 1742db6 commit 6184058

File tree

1 file changed

+16
-5
lines changed

1 file changed

+16
-5
lines changed

apps/common/utils/fork.py

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
import copy
2-
import logging
32
import re
43
import traceback
54
from functools import reduce
@@ -139,18 +138,30 @@ def get_beautiful_soup(response):
139138
html_content = response.content.decode(encoding)
140139
beautiful_soup = BeautifulSoup(html_content, "html.parser")
141140
meta_list = beautiful_soup.find_all('meta')
142-
charset_list = [meta.attrs.get('charset') for meta in meta_list if
143-
meta.attrs is not None and 'charset' in meta.attrs]
141+
charset_list = Fork.get_charset_list(meta_list)
144142
if len(charset_list) > 0:
145143
charset = charset_list[0]
146144
if charset != encoding:
147145
try:
148-
html_content = response.content.decode(charset)
146+
html_content = response.content.decode(charset, errors='replace')
149147
except Exception as e:
150-
maxkb_logger.error(f'{e}')
148+
maxkb_logger.error(f'{e}: {traceback.format_exc()}')
151149
return BeautifulSoup(html_content, "html.parser")
152150
return beautiful_soup
153151

152+
@staticmethod
def get_charset_list(meta_list):
    """Collect the character sets declared by a sequence of ``<meta>`` tags.

    Two declaration forms are recognised, in document order:

    * ``<meta charset="...">`` -- the value of the ``charset`` attribute.
    * ``<meta http-equiv="Content-Type" content="...; charset=...">`` --
      the ``charset`` parameter inside the ``content`` attribute.

    :param meta_list: iterable of tag-like objects exposing an ``attrs``
        mapping (or ``None``).
    :return: list of non-empty charset names with surrounding whitespace
        and quotes removed; empty when nothing is declared.
    """
    charset_list = []
    for meta in meta_list:
        attrs = meta.attrs
        if not attrs:
            continue
        if 'charset' in attrs:
            charset = (attrs.get('charset') or '').strip().strip('"\'').strip()
        elif (attrs.get('http-equiv') or '').lower() == 'content-type' and 'content' in attrs:
            # Tolerate whitespace around '=' and an optional opening quote,
            # and stop at the closing quote so it is not captured as part
            # of the encoding name.
            match = re.search(r'charset\s*=\s*["\']?([^\s;"\']+)', attrs.get('content') or '', re.I)
            charset = match.group(1) if match else ''
        else:
            continue
        # A blank declaration is not a usable encoding name; skip it so the
        # first entry of the result is always a real candidate.
        if charset:
            charset_list.append(charset)
    return charset_list
164+
154165
def fork(self):
155166
try:
156167

0 commit comments

Comments
 (0)