
Commit 0ff771d

fix: disable URL document upload and parsing

For security reasons, remove all functionality related to URL document upload and parsing:
1. Add a URL upload check in the knowledge-base router layer
2. Remove the URL metadata generation logic
3. Disable the URL-to-markdown conversion
4. Remove URL handling logic from each knowledge-base implementation
5. Disable the URL upload option in the frontend and add a hint
1 parent 55f2e02 commit 0ff771d

File tree: 7 files changed, +44 −112 lines

server/routers/knowledge_router.py

Lines changed: 4 additions & 0 deletions
@@ -210,6 +210,10 @@ async def add_documents(
 
     content_type = params.get("content_type", "file")
 
+    # Block URL parsing and ingestion
+    if content_type == "url":
+        raise HTTPException(status_code=400, detail="URL document upload and parsing is disabled")
+
     # Security check: validate the file path
     if content_type == "file":
         from src.knowledge.utils.kb_utils import validate_file_path
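The new router-level check rejects URL items before any parsing or storage happens, while file uploads continue into the existing path validation. A minimal standalone sketch of the same guard (the app, route path, and payload shape below are hypothetical, not the project's actual knowledge_router; FastAPI's TestClient needs httpx installed):

from fastapi import FastAPI, HTTPException
from fastapi.testclient import TestClient

app = FastAPI()

@app.post("/knowledge/documents")
async def add_documents(payload: dict):
    params = payload.get("params", {})
    content_type = params.get("content_type", "file")
    # Reject URL ingestion up front, mirroring the check added in the diff above
    if content_type == "url":
        raise HTTPException(status_code=400, detail="URL document upload and parsing is disabled")
    return {"status": "accepted", "content_type": content_type}

client = TestClient(app)
resp = client.post("/knowledge/documents", json={"params": {"content_type": "url"}})
assert resp.status_code == 400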

src/knowledge/implementations/chroma.py

Lines changed: 7 additions & 9 deletions
@@ -9,7 +9,7 @@
 from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
 
 from src.knowledge.base import KnowledgeBase
-from src.knowledge.indexing import process_file_to_markdown, process_url_to_markdown
+from src.knowledge.indexing import process_file_to_markdown
 from src.knowledge.utils.kb_utils import (
     get_embedding_config,
     prepare_item_metadata,

@@ -204,10 +204,9 @@ async def add_content(self, db_id: str, items: list[str], params: dict | None) -
         params["db_id"] = db_id
 
         # Process the content according to its type
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(item, params=params)
-        else:  # URL
-            markdown_content = await process_url_to_markdown(item, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(item, params=params)
 
         # Split the text into chunks
         chunks = self._split_text_into_chunks(markdown_content, file_id, filename, params)

@@ -296,10 +295,9 @@ async def update_content(self, db_id: str, file_ids: list[str], params: dict | N
         self._save_metadata()
 
         # Re-parse the file into markdown
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(file_path, params=params)
-        else:
-            markdown_content = await process_url_to_markdown(file_path, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(file_path, params=params)
 
         # First delete the existing ChromaDB data (chunks only; keep the metadata)
         await self.delete_file_chunks_only(db_id, file_id)

src/knowledge/implementations/lightrag.py

Lines changed: 11 additions & 13 deletions
@@ -10,7 +10,7 @@
 
 from src import config
 from src.knowledge.base import KnowledgeBase
-from src.knowledge.indexing import process_file_to_markdown, process_url_to_markdown
+from src.knowledge.indexing import process_file_to_markdown
 from src.knowledge.utils.kb_utils import get_embedding_config, prepare_item_metadata
 from src.utils import hashstr, logger
 from src.utils.datetime_utils import shanghai_now

@@ -243,12 +243,11 @@ async def add_content(self, db_id: str, items: list[str], params: dict | None =
         params["db_id"] = db_id
 
         # Process the content according to its type
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(item, params=params)
-            markdown_content_lines = markdown_content[:100].replace("\n", " ")
-            logger.info(f"Markdown content: {markdown_content_lines}...")
-        else:  # URL
-            markdown_content = await process_url_to_markdown(item, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(item, params=params)
+        markdown_content_lines = markdown_content[:100].replace("\n", " ")
+        logger.info(f"Markdown content: {markdown_content_lines}...")
 
         # Insert the content using LightRAG
         await rag.ainsert(input=markdown_content, ids=file_id, file_paths=item_path)

@@ -313,12 +312,11 @@ async def update_content(self, db_id: str, file_ids: list[str], params: dict | N
         self._save_metadata()
 
         # Re-parse the file into markdown
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(file_path, params=params)
-            markdown_content_lines = markdown_content[:100].replace("\n", " ")
-            logger.info(f"Markdown content: {markdown_content_lines}...")
-        else:
-            markdown_content = await process_url_to_markdown(file_path, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(file_path, params=params)
+        markdown_content_lines = markdown_content[:100].replace("\n", " ")
+        logger.info(f"Markdown content: {markdown_content_lines}...")
 
         # First delete the existing LightRAG data (chunks only; keep the metadata)
         await self.delete_file_chunks_only(db_id, file_id)

src/knowledge/implementations/milvus.py

Lines changed: 7 additions & 9 deletions
@@ -8,7 +8,7 @@
 from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections, db, utility
 
 from src.knowledge.base import KnowledgeBase
-from src.knowledge.indexing import process_file_to_markdown, process_url_to_markdown
+from src.knowledge.indexing import process_file_to_markdown
 from src.knowledge.utils.kb_utils import (
     get_embedding_config,
     prepare_item_metadata,

@@ -247,10 +247,9 @@ async def add_content(self, db_id: str, items: list[str], params: dict | None =
             params = {}
         params["db_id"] = db_id
 
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(item, params=params)
-        else:
-            markdown_content = await process_url_to_markdown(item, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(item, params=params)
 
         chunks = self._split_text_into_chunks(markdown_content, file_id, filename, params)
         logger.info(f"Split {filename} into {len(chunks)} chunks")

@@ -342,10 +341,9 @@ async def update_content(self, db_id: str, file_ids: list[str], params: dict | N
         self._save_metadata()
 
         # Re-parse the file into markdown
-        if content_type == "file":
-            markdown_content = await process_file_to_markdown(file_path, params=params)
-        else:
-            markdown_content = await process_url_to_markdown(file_path, params=params)
+        if content_type != "file":
+            raise ValueError("URL content parsing is disabled")
+        markdown_content = await process_file_to_markdown(file_path, params=params)
 
         # First delete the existing Milvus data (chunks only; keep the metadata)
         await self.delete_file_chunks_only(db_id, file_id)
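Beyond the router check, each implementation (Chroma, LightRAG, Milvus) now fails fast with a ValueError for any non-file content_type, so URL items are rejected even if a caller skips the HTTP layer. A self-contained sketch of that caller-side behavior (the stub class below is hypothetical and only mirrors the shared guard, not the project's KnowledgeBase interface):

import asyncio

class StubKnowledgeBase:
    """Hypothetical stand-in mirroring the guard shared by the implementations."""

    async def add_content(self, db_id: str, items: list[str], params: dict | None = None) -> list[str]:
        content_type = (params or {}).get("content_type", "file")
        if content_type != "file":
            raise ValueError("URL content parsing is disabled")
        # File items would be parsed to markdown and chunked here
        return [f"processed:{item}" for item in items]

async def main() -> None:
    kb = StubKnowledgeBase()
    try:
        await kb.add_content("db1", ["https://example.com"], params={"content_type": "url"})
    except ValueError as exc:
        print(f"rejected: {exc}")  # prints: rejected: URL content parsing is disabled

asyncio.run(main())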

src/knowledge/indexing.py

Lines changed: 1 addition & 21 deletions
@@ -551,24 +551,4 @@ def replace_link(match):
 
 
 async def process_url_to_markdown(url: str, params: dict | None = None) -> str:
-    """
-    Convert a URL to markdown format
-
-    Args:
-        url: the URL address
-        params: processing parameters
-
-    Returns:
-        content in markdown format
-    """
-    import requests
-    from bs4 import BeautifulSoup
-
-    try:
-        response = requests.get(url, timeout=30)
-        soup = BeautifulSoup(response.content, "html.parser")
-        text_content = soup.get_text()
-        return f"# {url}\n\n{text_content}"
-    except Exception as e:
-        logger.error(f"Failed to process URL {url}: {e}")
-        return f"# {url}\n\nFailed to process URL: {e}"
+    raise NotImplementedError("URL parsing is disabled")
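The stub now raises as soon as the coroutine runs, so any remaining caller sees a NotImplementedError instead of fetched page content. A small sketch of that behavior (assumes the project package is importable; the example URL is arbitrary):

import asyncio

from src.knowledge.indexing import process_url_to_markdown

try:
    asyncio.run(process_url_to_markdown("https://example.com"))
except NotImplementedError as exc:
    print(f"URL parsing rejected: {exc}")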

src/knowledge/utils/kb_utils.py

Lines changed: 2 additions & 6 deletions
@@ -150,12 +150,8 @@ def prepare_item_metadata(item: str, content_type: str, db_id: str, params: dict
             content_hash = calculate_content_hash(file_path)
         except Exception as exc:  # noqa: BLE001
             logger.warning(f"Failed to calculate content hash for {file_path}: {exc}")
-    else:  # URL
-        file_id = f"url_{hashstr(item + str(time.time()), 6)}"
-        file_type = "url"
-        filename = f"webpage_{hashstr(item, 6)}.md"
-        item_path = item
-        content_hash = None
+    else:
+        raise ValueError("URL metadata generation is disabled")
 
     metadata = {
         "database_id": db_id,

web/src/components/FileUploadModal.vue

Lines changed: 12 additions & 54 deletions
@@ -12,7 +12,7 @@
         type="primary"
         @click="chunkData"
         :loading="chunkLoading"
-        :disabled="(uploadMode === 'file' && fileList.length === 0) || (uploadMode === 'url' && !urlList.trim())"
+        :disabled="fileList.length === 0"
       >
         Add to knowledge base
       </a-button>

@@ -26,6 +26,7 @@
         :options="uploadModeOptions"
         size="large"
         class="source-segmented"
+        :disabled="true"
       />
     </div>
     <div class="config-controls">

@@ -101,24 +102,7 @@
         </a-upload-dragger>
       </div>
 
-
-
-      <!-- URL input area -->
-      <div class="url-input" v-if="uploadMode === 'url'">
-        <a-form layout="vertical">
-          <a-form-item label="Web links (one URL per line)">
-            <a-textarea
-              v-model:value="urlList"
-              placeholder="Enter web links, one per line"
-              :rows="6"
-              :disabled="chunkLoading"
-            />
-          </a-form-item>
-        </a-form>
-        <p class="url-hint">
-          Adding web page content is supported; the system automatically fetches the page text and splits it into chunks. Make sure the URLs are correctly formatted and publicly accessible.
-        </p>
-      </div>
+
     </div>
   </a-modal>
 

@@ -139,7 +123,7 @@
 
 <script setup>
 import { ref, computed, onMounted, watch } from 'vue';
-import { message, Upload } from 'ant-design-vue';
+import { message, Upload, Tooltip } from 'ant-design-vue';
 import { useUserStore } from '@/stores/user';
 import { useDatabaseStore } from '@/stores/database';
 import { ocrApi } from '@/apis/system_api';

@@ -269,19 +253,20 @@ const uploadModeOptions = computed(() => [
   },
   {
     value: 'url',
-    label: h('div', { class: 'segmented-option' }, [
-      h(LinkOutlined, { class: 'option-icon' }),
-      h('span', { class: 'option-text' }, 'Enter URL'),
-    ]),
+    label: h(Tooltip, { title: 'URL document upload and parsing has been disabled; for security reasons, the current version only supports file uploads' }, {
+      default: () => h('div', { class: 'segmented-option' }, [
+        h(LinkOutlined, { class: 'option-icon' }),
+        h('span', { class: 'option-text' }, 'Enter URL'),
+      ])
+    }),
   },
 ]);
 
 // File list
 const fileList = ref([]);
 
 
-// URL list
-const urlList = ref('');
+// URL-related functionality has been removed
 
 // OCR service health status
 const ocrHealthStatus = ref({

@@ -330,14 +315,7 @@ const isOcrEnabled = computed(() => {
   return chunkParams.value.enable_ocr !== 'disable';
 });
 
-watch(uploadMode, (mode, previous) => {
-  if (mode === 'url') {
-    previousOcrSelection.value = chunkParams.value.enable_ocr;
-    chunkParams.value.enable_ocr = 'disable';
-  } else if (mode === 'file' && previous === 'url') {
-    chunkParams.value.enable_ocr = previousOcrSelection.value || 'disable';
-  }
-});
+// Upload-mode switching logic has been removed
 
 // Computed: whether there are any PDF or image files
 const hasPdfOrImageFiles = computed(() => {

@@ -630,31 +608,11 @@ const chunkData = async () => {
     } finally {
       store.state.chunkLoading = false;
     }
-  } else if (uploadMode.value === 'url') {
-    const urls = urlList.value.split('\n')
-      .map(url => url.trim())
-      .filter(url => url.length > 0 && (url.startsWith('http://') || url.startsWith('https://')));
-
-    if (urls.length === 0) {
-      message.error('Please enter valid web links (they must start with http:// or https://)');
-      return;
-    }
-
-    try {
-      store.state.chunkLoading = true;
-      success = await store.addFiles({ items: urls, contentType: 'url', params: chunkParams.value });
-    } catch (error) {
-      console.error('URL upload failed:', error);
-      message.error('URL upload failed: ' + (error.message || 'unknown error'));
-    } finally {
-      store.state.chunkLoading = false;
-    }
   }
 
   if (success) {
     emit('update:visible', false);
     fileList.value = [];
-    urlList.value = '';
   }
 };