Skip to content

Commit 053cbd3

Browse files
committed
fix: Some web pages are unable to be crawled
1 parent 76c0b66 commit 053cbd3

File tree

4 files changed

+33
-2
lines changed

4 files changed

+33
-2
lines changed

apps/common/util/fork.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,28 @@ def remove_fragment(url: str) -> str:
5353
return urlunparse(modified_url)
5454

5555

56+
def remove_last_path_robust(url):
    """Return ``url`` with its final non-empty path segment removed.

    Empty segments (produced by leading, trailing, or doubled slashes) are
    discarded before the last segment is dropped, so the rebuilt path is
    either ``/`` or ``/seg1/seg2`` with no trailing slash. The scheme,
    netloc, params, query and fragment are carried over unchanged.
    """
    parts = urlparse(url)

    # Splitting on '/' yields '' for empty segments; keep only the real ones.
    segments = list(filter(None, parts.path.split('/')))

    # Drop the last segment; slicing is a harmless no-op on an empty list.
    remaining = segments[:-1]

    # Rebuild the path, falling back to the root when nothing is left.
    if remaining:
        trimmed_path = '/' + '/'.join(remaining)
    else:
        trimmed_path = '/'

    # Reassemble the URL with only the path component swapped out.
    return urlunparse(parts._replace(path=trimmed_path))
5678
class Fork:
5779
class Response:
5880
def __init__(self, content: str, child_link_list: List[ChildLink], status, message: str):
@@ -72,7 +94,7 @@ def error(message: str):
7294
def __init__(self, base_fork_url: str, selector_list: List[str]):
7395
base_fork_url = remove_fragment(base_fork_url)
7496
if any([True for end_str in ['index.html', '.htm', '.html'] if base_fork_url.endswith(end_str)]):
75-
self.base_fork_url = str(Path(base_fork_url).parent)
97+
base_fork_url =remove_last_path_robust(base_fork_url)
7698
self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.')
7799
parsed = urlsplit(base_fork_url)
78100
query = parsed.query
@@ -190,4 +212,4 @@ def fork(self):
190212
def handler(base_url, response: Fork.Response):
191213
print(base_url.url, base_url.tag.text if base_url.tag else None, response.content)
192214

193-
# ForkManage('https://bbs.fit2cloud.com/c/de/6', ['.md-content']).fork(3, set(), handler)
215+
# ForkManage('https://hzqcgc.htc.edu.cn/jxky.htm', ['.md-content']).fork(3, set(), handler)

ui/src/locales/lang/en-US/ai-chat.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ export default {
6363
limitMessage2: 'files',
6464
sizeLimit: 'Each file must not exceed',
6565
imageMessage: 'Please process the image content',
66+
documentMessage: 'Please understand the content of the document',
67+
audioMessage: 'Please understand the video content',
68+
otherMessage: 'Please understand the file content',
6669
errorMessage: 'Upload Failed'
6770
},
6871
executionDetails: {

ui/src/locales/lang/zh-CN/ai-chat.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ export default {
6161
limitMessage2: '个文件',
6262
sizeLimit: '单个文件大小不能超过',
6363
imageMessage: '请解析图片内容',
64+
documentMessage: '请理解文档内容',
65+
audioMessage: '请理解视频内容',
66+
otherMessage: '请理解文件内容',
6467
errorMessage: '上传失败'
6568
},
6669
executionDetails: {

ui/src/locales/lang/zh-Hant/ai-chat.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ export default {
6161
limitMessage2: '個文件',
6262
sizeLimit: '單個文件大小不能超過',
6363
imageMessage: '請解析圖片內容',
64+
documentMessage: '請理解檔案內容',
65+
audioMessage: '請理解視頻內容',
66+
otherMessage: '請理解檔案內容',
6467
errorMessage: '上傳失敗'
6568
},
6669
executionDetails: {

0 commit comments

Comments
 (0)