
Commit 0ff771d

fix: disable URL document upload and parsing

For security reasons, remove all functionality related to URL document upload and parsing:
1. Add a URL upload check in the knowledge-base router layer
2. Remove the URL metadata generation logic
3. Disable the URL-to-markdown conversion
4. Remove URL handling logic from each knowledge-base implementation
5. Disable the URL upload option in the frontend and add a hint
1 parent 55f2e02 commit 0ff771d

File tree: 7 files changed, +44 −112 lines

server/routers/knowledge_router.py

Lines changed: 4 additions & 0 deletions
@@ -210,6 +210,10 @@ async def add_documents(
 
     content_type = params.get("content_type", "file")
 
+    # Block URL parsing and ingestion
+    if content_type == "url":
+        raise HTTPException(status_code=400, detail="URL document upload and parsing is disabled")
+
     # Security check: validate the file path
     if content_type == "file":
         from src.knowledge.utils.kb_utils import validate_file_path
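The new router-level check rejects URL items before any parsing or storage happens, while file uploads continue into the existing path validation. A minimal standalone sketch of the same guard (the app, route path, and payload shape below are hypothetical, not the project's actual knowledge_router; FastAPI's TestClient needs httpx installed):

from fastapi import FastAPI, HTTPException
from fastapi.testclient import TestClient

app = FastAPI()

@app.post("/knowledge/documents")
async def add_documents(payload: dict):
    params = payload.get("params", {})
    content_type = params.get("content_type", "file")
    # Reject URL ingestion up front, mirroring the check added in the diff above
    if content_type == "url":
        raise HTTPException(status_code=400, detail="URL document upload and parsing is disabled")
    return {"status": "accepted", "content_type": content_type}

client = TestClient(app)
resp = client.post("/knowledge/documents", json={"params": {"content_type": "url"}})
assert resp.status_code == 400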

src/knowledge/implementations/chroma.py

Lines changed: 7 additions & 9 deletions
@@ -9,7 +9,7 @@
 from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
 
 from src.knowledge.base import KnowledgeBase
-from src.knowledge.indexing import process_file_to_markdown, process_url_to_markdown
+from src.knowledge.indexing import process_file_to_markdown
 from src.knowledge.utils.kb_utils import (
     get_embedding_config,
     prepare_item_metadata,

@@ -204,10 +204,9 @@ async def add_content(self, db_id: str, items: list[str], params: dict | None) -
         params["db_id"] = db_id
 
         # Process the content according to its type
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(item, params=params)
-        else:  # URL
-            markdown_content = await process_url_to_markdown(item, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(item, params=params)
 
         # Split the text into chunks
         chunks = self._split_text_into_chunks(markdown_content, file_id, filename, params)

@@ -296,10 +295,9 @@ async def update_content(self, db_id: str, file_ids: list[str], params: dict | N
         self._save_metadata()
 
         # Re-parse the file into markdown
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(file_path, params=params)
-        else:
-            markdown_content = await process_url_to_markdown(file_path, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(file_path, params=params)
 
         # First delete the existing ChromaDB data (chunks only; keep the metadata)
         await self.delete_file_chunks_only(db_id, file_id)

src/knowledge/implementations/lightrag.py

Lines changed: 11 additions & 13 deletions
@@ -10,7 +10,7 @@
 
 from src import config
 from src.knowledge.base import KnowledgeBase
-from src.knowledge.indexing import process_file_to_markdown, process_url_to_markdown
+from src.knowledge.indexing import process_file_to_markdown
 from src.knowledge.utils.kb_utils import get_embedding_config, prepare_item_metadata
 from src.utils import hashstr, logger
 from src.utils.datetime_utils import shanghai_now

@@ -243,12 +243,11 @@ async def add_content(self, db_id: str, items: list[str], params: dict | None =
         params["db_id"] = db_id
 
         # Process the content according to its type
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(item, params=params)
-            markdown_content_lines = markdown_content[:100].replace("\n", " ")
-            logger.info(f"Markdown content: {markdown_content_lines}...")
-        else:  # URL
-            markdown_content = await process_url_to_markdown(item, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(item, params=params)
+        markdown_content_lines = markdown_content[:100].replace("\n", " ")
+        logger.info(f"Markdown content: {markdown_content_lines}...")
 
         # Insert the content using LightRAG
         await rag.ainsert(input=markdown_content, ids=file_id, file_paths=item_path)

@@ -313,12 +312,11 @@ async def update_content(self, db_id: str, file_ids: list[str], params: dict | N
         self._save_metadata()
 
         # Re-parse the file into markdown
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(file_path, params=params)
-            markdown_content_lines = markdown_content[:100].replace("\n", " ")
-            logger.info(f"Markdown content: {markdown_content_lines}...")
-        else:
-            markdown_content = await process_url_to_markdown(file_path, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(file_path, params=params)
+        markdown_content_lines = markdown_content[:100].replace("\n", " ")
+        logger.info(f"Markdown content: {markdown_content_lines}...")
 
         # First delete the existing LightRAG data (chunks only; keep the metadata)
         await self.delete_file_chunks_only(db_id, file_id)

src/knowledge/implementations/milvus.py

Lines changed: 7 additions & 9 deletions
@@ -8,7 +8,7 @@
 from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, db, utility
 
 from src.knowledge.base import KnowledgeBase
-from src.knowledge.indexing import process_file_to_markdown, process_url_to_markdown
+from src.knowledge.indexing import process_file_to_markdown
 from src.knowledge.utils.kb_utils import (
     get_embedding_config,
     prepare_item_metadata,

@@ -247,10 +247,9 @@ async def add_content(self, db_id: str, items: list[str], params: dict | None =
             params = {}
         params["db_id"] = db_id
 
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(item, params=params)
-        else:
-            markdown_content = await process_url_to_markdown(item, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(item, params=params)
 
         chunks = self._split_text_into_chunks(markdown_content, file_id, filename, params)
         logger.info(f"Split {filename} into {len(chunks)} chunks")

@@ -342,10 +341,9 @@ async def update_content(self, db_id: str, file_ids: list[str], params: dict | N
         self._save_metadata()
 
         # Re-parse the file into markdown
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(file_path, params=params)
-        else:
-            markdown_content = await process_url_to_markdown(file_path, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(file_path, params=params)
 
         # First delete the existing Milvus data (chunks only; keep the metadata)
         await self.delete_file_chunks_only(db_id, file_id)
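Beyond the router check, each implementation (Chroma, LightRAG, Milvus) now fails fast with a ValueError for any non-file content_type, so URL items are rejected even if a caller skips the HTTP layer. A self-contained sketch of that caller-side behavior (the stub class below is hypothetical and only mirrors the shared guard, not the project's KnowledgeBase interface):

import asyncio

class StubKnowledgeBase:
    """Hypothetical stand-in mirroring the guard shared by the implementations."""

    async def add_content(self, db_id: str, items: list[str], params: dict | None = None) -> list[str]:
        content_type = (params or {}).get("content_type", "file")
        if content_type != "file":
            raise ValueError("URL content parsing is disabled")
        # File items would be parsed to markdown and chunked here
        return [f"processed:{item}" for item in items]

async def main() -> None:
    kb = StubKnowledgeBase()
    try:
        await kb.add_content("db1", ["https://example.com"], params={"content_type": "url"})
    except ValueError as exc:
        print(f"rejected: {exc}")  # prints: rejected: URL content parsing is disabled

asyncio.run(main())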

src/knowledge/indexing.py

Lines changed: 1 addition & 21 deletions
@@ -551,24 +551,4 @@ def replace_link(match):
 
 
 async def process_url_to_markdown(url: str, params: dict | None = None) -> str:
-    """
-    Convert a URL to markdown format
-
-    Args:
-        url: the URL address
-        params: processing parameters
-
-    Returns:
-        content in markdown format
-    """
-    import requests
-    from bs4 import BeautifulSoup
-
-    try:
-        response = requests.get(url, timeout=30)
-        soup = BeautifulSoup(response.content, "html.parser")
-        text_content = soup.get_text()
-        return f"# {url}\n\n{text_content}"
-    except Exception as e:
-        logger.error(f"Failed to process URL {url}: {e}")
-        return f"# {url}\n\nFailed to process URL: {e}"
+    raise NotImplementedError("URL parsing is disabled")
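The stub now raises as soon as the coroutine runs, so any remaining caller sees a NotImplementedError instead of fetched page content. A small sketch of that behavior (assumes the project package is importable; the example URL is arbitrary):

import asyncio

from src.knowledge.indexing import process_url_to_markdown

try:
    asyncio.run(process_url_to_markdown("https://example.com"))
except NotImplementedError as exc:
    print(f"URL parsing rejected: {exc}")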

src/knowledge/utils/kb_utils.py

Lines changed: 2 additions & 6 deletions
@@ -150,12 +150,8 @@ def prepare_item_metadata(item: str, content_type: str, db_id: str, params: dict
             content_hash = calculate_content_hash(file_path)
         except Exception as exc:  # noqa: BLE001
             logger.warning(f"Failed to calculate content hash for {file_path}: {exc}")
-    else:  # URL
-        file_id = f"url_{hashstr(item + str(time.time()), 6)}"
-        file_type = "url"
-        filename = f"webpage_{hashstr(item, 6)}.md"
-        item_path = item
-        content_hash = None
+    else:
+        raise ValueError("URL metadata generation is disabled")
 
     metadata = {
         "database_id": db_id,

web/src/components/FileUploadModal.vue

Lines changed: 12 additions & 54 deletions
@@ -12,7 +12,7 @@
         type="primary"
         @click="chunkData"
         :loading="chunkLoading"
-        :disabled="(uploadMode === 'file' && fileList.length === 0) || (uploadMode === 'url' && !urlList.trim())"
+        :disabled="fileList.length === 0"
       >
         Add to knowledge base
       </a-button>

@@ -26,6 +26,7 @@
         :options="uploadModeOptions"
         size="large"
         class="source-segmented"
+        :disabled="true"
       />
     </div>
     <div class="config-controls">

@@ -101,24 +102,7 @@
         </a-upload-dragger>
       </div>
 
-
-
-      <!-- URL input area -->
-      <div class="url-input" v-if="uploadMode === 'url'">
-        <a-form layout="vertical">
-          <a-form-item label="Web links (one URL per line)">
-            <a-textarea
-              v-model:value="urlList"
-              placeholder="Enter web links, one per line"
-              :rows="6"
-              :disabled="chunkLoading"
-            />
-          </a-form-item>
-        </a-form>
-        <p class="url-hint">
-          Adding web page content is supported; the system automatically fetches the page text and splits it into chunks. Make sure the URLs are correctly formatted and publicly accessible.
-        </p>
-      </div>
+
     </div>
   </a-modal>
 

@@ -139,7 +123,7 @@
 
 <script setup>
 import { ref, computed, onMounted, watch } from 'vue';
-import { message, Upload } from 'ant-design-vue';
+import { message, Upload, Tooltip } from 'ant-design-vue';
 import { useUserStore } from '@/stores/user';
 import { useDatabaseStore } from '@/stores/database';
 import { ocrApi } from '@/apis/system_api';

@@ -269,19 +253,20 @@ const uploadModeOptions = computed(() => [
   },
   {
     value: 'url',
-    label: h('div', { class: 'segmented-option' }, [
-      h(LinkOutlined, { class: 'option-icon' }),
-      h('span', { class: 'option-text' }, 'Enter URL'),
-    ]),
+    label: h(Tooltip, { title: 'URL document upload and parsing has been disabled; for security reasons, the current version only supports file uploads' }, {
+      default: () => h('div', { class: 'segmented-option' }, [
+        h(LinkOutlined, { class: 'option-icon' }),
+        h('span', { class: 'option-text' }, 'Enter URL'),
+      ])
+    }),
   },
 ]);
 
 // File list
 const fileList = ref([]);
 
 
-// URL list
-const urlList = ref('');
+// URL-related functionality has been removed
 
 // OCR service health status
 const ocrHealthStatus = ref({

@@ -330,14 +315,7 @@ const isOcrEnabled = computed(() => {
   return chunkParams.value.enable_ocr !== 'disable';
 });
 
-watch(uploadMode, (mode, previous) => {
-  if (mode === 'url') {
-    previousOcrSelection.value = chunkParams.value.enable_ocr;
-    chunkParams.value.enable_ocr = 'disable';
-  } else if (mode === 'file' && previous === 'url') {
-    chunkParams.value.enable_ocr = previousOcrSelection.value || 'disable';
-  }
-});
+// Upload-mode switching logic has been removed
 
 // Computed: whether there are any PDF or image files
 const hasPdfOrImageFiles = computed(() => {

@@ -630,31 +608,11 @@ const chunkData = async () => {
     } finally {
       store.state.chunkLoading = false;
     }
-  } else if (uploadMode.value === 'url') {
-    const urls = urlList.value.split('\n')
-      .map(url => url.trim())
-      .filter(url => url.length > 0 && (url.startsWith('http://') || url.startsWith('https://')));
-
-    if (urls.length === 0) {
-      message.error('Please enter valid web links (they must start with http:// or https://)');
-      return;
-    }
-
-    try {
-      store.state.chunkLoading = true;
-      success = await store.addFiles({ items: urls, contentType: 'url', params: chunkParams.value });
-    } catch (error) {
-      console.error('URL upload failed:', error);
-      message.error('URL upload failed: ' + (error.message || 'unknown error'));
-    } finally {
-      store.state.chunkLoading = false;
-    }
   }
 
   if (success) {
     emit('update:visible', false);
     fileList.value = [];
-    urlList.value = '';
   }
 };