xerrors
diff --git a/‎server/routers/base_router.py‎
Lines changed: 109 additions & 0 deletions b/‎server/routers/base_router.py‎
Lines changed: 109 additions & 0 deletions
diff --git a/‎src/__init__.py‎
Lines changed: 28 additions & 2 deletions b/‎src/__init__.py‎
Lines changed: 28 additions & 2 deletions
diff --git a/‎src/core/indexing.py‎
Lines changed: 42 additions & 12 deletions b/‎src/core/indexing.py‎
Lines changed: 42 additions & 12 deletions
diff --git a/‎src/core/knowledge_base.py‎
Lines changed: 1 addition & 1 deletion b/‎src/core/knowledge_base.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/core/lightrag_kb.py‎
Lines changed: 4 additions & 1 deletion b/‎src/core/lightrag_kb.py‎
Lines changed: 4 additions & 1 deletion
@@ -1,5 +1,7 @@
 import os
 import yaml
+import asyncio
+import requests
 from pathlib import Path
 from fastapi import Request, Body, Depends, HTTPException
 from fastapi import APIRouter
@@ -133,4 +135,111 @@ async def reload_info_config():
         logger.error(f"重新加载信息配置失败: {e}")
         raise HTTPException(status_code=500, detail="重新加载信息配置失败")
 
+def _probe_rapid_ocr():
+    """
+    Check whether the local RapidOCR (ONNX) models exist and can be initialised.
+
+    This function is synchronous (filesystem checks plus model construction),
+    so async callers should run it in a worker thread.
+
+    Returns:
+        dict: ``{"status": ..., "message": ...}`` where status is "healthy",
+        "unavailable" (a model path is missing) or "error" (anything raised).
+    """
+    try:
+        model_dir = os.path.join(os.getenv("MODEL_DIR", ""), "SWHL/RapidOCR")
+        det_model_path = os.path.join(model_dir, "PP-OCRv4/ch_PP-OCRv4_det_infer.onnx")
+        rec_model_path = os.path.join(model_dir, "PP-OCRv4/ch_PP-OCRv4_rec_infer.onnx")
+
+        model_paths = (model_dir, det_model_path, rec_model_path)
+        if not all(os.path.exists(path) for path in model_paths):
+            return {"status": "unavailable", "message": f"模型文件不存在: {model_dir}"}
+
+        # Constructing the engine is the check itself; the instance is discarded.
+        from rapidocr_onnxruntime import RapidOCR
+        RapidOCR(det_box_thresh=0.3, det_model_path=det_model_path, rec_model_path=rec_model_path)
+        return {"status": "healthy", "message": "RapidOCR模型已加载"}
+    except Exception as e:
+        return {"status": "error", "message": f"RapidOCR初始化失败: {str(e)}"}
+
+
+def _probe_ocr_http_service(label, base_uri):
+    """
+    Probe ``{base_uri}/health`` of a remote OCR service with a 5 second timeout.
+
+    This function is synchronous (``requests.get``), so async callers should
+    run it in a worker thread.
+
+    Args:
+        label: Service name used as the prefix of the returned message.
+        base_uri: Base URI of the service, without the ``/health`` suffix.
+
+    Returns:
+        dict: ``{"status": ..., "message": ...}`` where status is "healthy"
+        (HTTP 200), "unhealthy" (any other status code), "unavailable",
+        "timeout" or "error".
+    """
+    try:
+        response = requests.get(f"{base_uri}/health", timeout=5)
+    except requests.exceptions.ConnectionError:
+        # NOTE: requests' ConnectTimeout derives from ConnectionError as well as
+        # Timeout, so connect timeouts are reported here as "unavailable".
+        return {"status": "unavailable", "message": f"{label}服务无法连接，请检查服务是否启动"}
+    except requests.exceptions.Timeout:
+        return {"status": "timeout", "message": f"{label}服务连接超时"}
+    except Exception as e:
+        return {"status": "error", "message": f"{label}服务检查失败: {str(e)}"}
+
+    if response.status_code == 200:
+        return {"status": "healthy", "message": f"{label}服务运行正常 ({base_uri})"}
+    return {"status": "unhealthy", "message": f"{label}服务响应异常: {response.status_code}"}
+
+
+@base.get("/ocr/health")
+async def check_ocr_services_health(current_user: User = Depends(get_admin_user)):
+    """
+    Report the health of every OCR backend (RapidOCR, MinerU, PaddleX).
+
+    The individual probes are blocking, so each one runs in a worker thread
+    via ``asyncio.to_thread``; this keeps the event loop free while a remote
+    service is slow or unreachable. The probes catch their own exceptions and
+    always return a status dict, so ``gather`` does not raise because of them.
+
+    Returns:
+        dict: ``overall_status`` ("healthy" if at least one backend is healthy,
+        otherwise "unhealthy"), ``services`` (per-backend status and message)
+        and a fixed ``message``.
+    """
+    rapid_status, mineru_status, paddlex_status = await asyncio.gather(
+        asyncio.to_thread(_probe_rapid_ocr),
+        asyncio.to_thread(
+            _probe_ocr_http_service, "MinerU", os.getenv("MINERU_OCR_URI", "http://localhost:30000")
+        ),
+        asyncio.to_thread(
+            _probe_ocr_http_service, "PaddleX", os.getenv("PADDLEX_URI", "http://localhost:8080")
+        ),
+    )
+
+    health_status = {
+        "rapid_ocr": rapid_status,
+        "mineru_ocr": mineru_status,
+        "paddlex_ocr": paddlex_status
+    }
+
+    # One working backend is enough for OCR to be usable overall.
+    overall_status = "healthy" if any(svc["status"] == "healthy" for svc in health_status.values()) else "unhealthy"
+
+    return {
+        "overall_status": overall_status,
+        "services": health_status,
+        "message": "OCR服务健康检查完成"
+    }
+
+@base.get("/ocr/stats")
+async def get_ocr_stats(current_user: User = Depends(get_admin_user)):
+    """
+    Return usage statistics for the OCR services.
+
+    Failures are logged and reported in the response body rather than raised.
+
+    Returns:
+        dict: ``status`` ("success" or "error"), ``stats`` (the value returned
+        by ``src.plugins._ocr.get_ocr_stats``, or ``{}`` on failure) and a
+        human-readable ``message``.
+    """
+    try:
+        # Aliased on purpose: the plugin helper has the same name as this
+        # endpoint, and an un-aliased import would shadow it inside the body.
+        from src.plugins._ocr import get_ocr_stats as collect_ocr_stats
+        stats = collect_ocr_stats()
+
+        return {
+            "status": "success",
+            "stats": stats,
+            "message": "OCR统计信息获取成功"
+        }
+    except Exception as e:
+        logger.error(f"获取OCR统计信息失败: {str(e)}")
+        return {
+            "status": "error",
+            "stats": {},
+            "message": f"获取OCR统计信息失败: {str(e)}"
+        }
+
 
@@ -1,3 +1,4 @@
+import os
 from dotenv import load_dotenv
 
 load_dotenv("src/.env", override=True)
@@ -8,8 +9,33 @@
 from src.config import Config  # noqa: E402
 config = Config()
 
-from src.core.lightrag_based_kb import LightRagBasedKB  # noqa: E402
-knowledge_base = LightRagBasedKB()
+# Import the knowledge-base modules
+from src.core.kb_factory import KnowledgeBaseFactory  # noqa: E402
+from src.core.kb_manager import KnowledgeBaseManager  # noqa: E402
+from src.core.lightrag_kb import LightRagKB  # noqa: E402
+from src.core.chroma_kb import ChromaKB  # noqa: E402
+from src.core.milvus_kb import MilvusKB  # noqa: E402
+
+# Register the knowledge-base types with the factory
+KnowledgeBaseFactory.register("lightrag", LightRagKB, {
+    "description": "基于图检索的知识库，支持实体关系构建和复杂查询"
+})
+
+KnowledgeBaseFactory.register("chroma", ChromaKB, {
+    "chunk_size": 1000,
+    "chunk_overlap": 200,
+    "description": "基于 ChromaDB 的轻量级向量知识库，适合开发和小规模部署"
+})
+
+KnowledgeBaseFactory.register("milvus", MilvusKB, {
+    "chunk_size": 1000,
+    "chunk_overlap": 200,
+    "description": "基于 Milvus 的生产级向量知识库，适合大规模高性能部署"
+})
+
+# Create the knowledge-base manager, rooted under the configured save_dir
+work_dir = os.path.join(config.save_dir, "knowledge_base_data")
+knowledge_base = KnowledgeBaseManager(work_dir)
 
 from src.core import GraphDatabase  # noqa: E402
 graph_base = GraphDatabase()
@@ -122,23 +122,53 @@ def plainreader(file_path):
     return text
 
 def parse_pdf(file, params=None):
-    params = params or {}
-    opt_ocr = params.get("enable_ocr", "disable")
+    """
+    Parse a PDF file, optionally through one of several OCR backends.
+
+    Args:
+        file: Path of the PDF file.
+        params: Optional dict; its "enable_ocr" entry (default "disable") selects the reader.
 
-    if opt_ocr == "onnx_rapid_ocr":
-        from src.plugins import ocr
-        return ocr.process_pdf(file)
+    Returns:
+        The text produced by the selected reader (presumably str).
 
-    elif opt_ocr == "mineru_ocr":
-        from src.plugins import ocr
-        return ocr.process_pdf_mineru(file)
+    Raises:
+        OCRServiceException: Re-raised as-is, or wrapping any other error when enable_ocr is not "disable".
+    """
+    from src.plugins._ocr import OCRServiceException
 
-    elif opt_ocr == "paddlex_ocr":
-        from src.plugins import ocr
-        return ocr.process_pdf_paddlex(file)
+    params = params or {}
+    opt_ocr = params.get("enable_ocr", "disable")
 
-    else:
+    if opt_ocr == "disable":
         return pdfreader(file, params=params)
 
+    try:
+        if opt_ocr == "onnx_rapid_ocr":
+            from src.plugins import ocr
+            return ocr.process_pdf(file)
+
+        elif opt_ocr == "mineru_ocr":
+            from src.plugins import ocr
+            return ocr.process_pdf_mineru(file)
+
+        elif opt_ocr == "paddlex_ocr":
+            from src.plugins import ocr
+            return ocr.process_pdf_paddlex(file)
+
+        else:
+            # Unrecognised option: fall back to the plain reader. Errors raised
+            # here are still wrapped below, unlike the "disable" path above.
+            return pdfreader(file, params=params)
+
+    except OCRServiceException as e:
+        logger.error(f"OCR service failed: {e.service_name} - {str(e)}")
+        raise
+    except Exception as e:
+        logger.error(f"PDF parsing failed: {str(e)}")
+        # Chain explicitly so the original failure is kept as __cause__.
+        raise OCRServiceException(
+            f"PDF解析失败: {str(e)}",
+            opt_ocr,
+            "parsing_failed"
+        ) from e
+
 async def parse_pdf_async(file, params=None):
+    """Await parse_pdf(file, params=params) executed through asyncio.to_thread and return its result."""
     return await asyncio.to_thread(parse_pdf, file, params=params)
@@ -390,7 +390,7 @@ async def _process_file_to_markdown(self, file_path: str,
             # 使用 OCR 处理 PDF
             from src.core.indexing import parse_pdf_async
             text = await parse_pdf_async(str(file_path_obj), params=params)
-            return f"Using OCR to process {file_path_obj.name}\n\n{text}"
+            return f"# {file_path_obj.name}\n\n{text}"
 
         elif file_ext in ['.txt', '.md']:
             # 直接读取文本文件
 
@@ -213,10 +213,13 @@ async def add_content(self, db_id: str, items: List[str],
                 file_record['status'] = "done"
 
             except Exception as e:
-                logger.error(f"处理{content_type} {item} 失败: {e}, {traceback.format_exc()}")
+                error_msg = str(e)
+                logger.error(f"处理{content_type} {item} 失败: {error_msg}, {traceback.format_exc()}")
                 self.files_meta[file_id]["status"] = "failed"
+                self.files_meta[file_id]["error"] = error_msg
                 self._save_metadata()
                 file_record['status'] = "failed"
+                file_record['error'] = error_msg
 
             processed_items_info.append(file_record)