Skip to content

Commit 84039ae

Browse files
committed
feat: 添加OCR服务健康检查和统计功能,优化PDF解析逻辑,增强错误处理机制
1 parent 14d95ce commit 84039ae

File tree

8 files changed

+613
-55
lines changed

8 files changed

+613
-55
lines changed

server/routers/base_router.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import os
22
import yaml
3+
import asyncio
4+
import requests
35
from pathlib import Path
46
from fastapi import Request, Body, Depends, HTTPException
57
from fastapi import APIRouter
@@ -133,4 +135,111 @@ async def reload_info_config():
133135
logger.error(f"重新加载信息配置失败: {e}")
134136
raise HTTPException(status_code=500, detail="重新加载信息配置失败")
135137

138+
def _probe_rapid_ocr() -> dict:
    """Check that the local RapidOCR (ONNX) models exist and can be loaded.

    Returns:
        A ``{"status": ..., "message": ...}`` dict. ``status`` is ``"healthy"``
        when the model files exist and ``RapidOCR`` can be constructed,
        ``"unavailable"`` when a model path is missing, and ``"error"`` when
        anything raises (including a failed import).
    """
    try:
        model_dir = os.path.join(os.getenv("MODEL_DIR", ""), "SWHL/RapidOCR")
        det_model_path = os.path.join(model_dir, "PP-OCRv4/ch_PP-OCRv4_det_infer.onnx")
        rec_model_path = os.path.join(model_dir, "PP-OCRv4/ch_PP-OCRv4_rec_infer.onnx")

        required_paths = (model_dir, det_model_path, rec_model_path)
        if not all(os.path.exists(path) for path in required_paths):
            return {"status": "unavailable", "message": f"模型文件不存在: {model_dir}"}

        # Constructing the engine is the actual check; the instance is discarded.
        from rapidocr_onnxruntime import RapidOCR
        RapidOCR(det_box_thresh=0.3, det_model_path=det_model_path, rec_model_path=rec_model_path)
        return {"status": "healthy", "message": "RapidOCR模型已加载"}
    except Exception as e:
        return {"status": "error", "message": f"RapidOCR初始化失败: {str(e)}"}


def _probe_http_ocr_service(name: str, base_uri: str, timeout: float = 5) -> dict:
    """Issue ``GET {base_uri}/health`` and translate the outcome into a status dict.

    Args:
        name: Service label used as the prefix of the returned messages.
        base_uri: Base URL of the service; ``/health`` is appended.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A ``{"status": ..., "message": ...}`` dict with status ``"healthy"``
        (HTTP 200), ``"unhealthy"`` (any other status code), ``"unavailable"``
        (connection error), ``"timeout"`` or ``"error"`` (anything else).
    """
    try:
        response = requests.get(f"{base_uri}/health", timeout=timeout)
    # ConnectionError is tested before Timeout, matching the original handler order.
    except requests.exceptions.ConnectionError:
        return {"status": "unavailable", "message": f"{name}服务无法连接,请检查服务是否启动"}
    except requests.exceptions.Timeout:
        return {"status": "timeout", "message": f"{name}服务连接超时"}
    except Exception as e:
        return {"status": "error", "message": f"{name}服务检查失败: {str(e)}"}

    if response.status_code == 200:
        return {"status": "healthy", "message": f"{name}服务运行正常 ({base_uri})"}
    return {"status": "unhealthy", "message": f"{name}服务响应异常: {response.status_code}"}


@base.get("/ocr/health")
async def check_ocr_services_health(current_user: User = Depends(get_admin_user)):
    """Report the availability of every OCR backend.

    The three probes are blocking (model loading, synchronous HTTP), so each
    one runs in a worker thread and they are awaited together. This keeps the
    event loop free while the checks are in flight.

    Returns:
        A dict with ``overall_status`` (``"healthy"`` if at least one backend
        is healthy, otherwise ``"unhealthy"``), ``services`` (per-backend
        ``status``/``message`` for ``rapid_ocr``, ``mineru_ocr`` and
        ``paddlex_ocr``) and a fixed ``message``.
    """
    mineru_uri = os.getenv("MINERU_OCR_URI", "http://localhost:30000")
    paddlex_uri = os.getenv("PADDLEX_URI", "http://localhost:8080")

    rapid_result, mineru_result, paddlex_result = await asyncio.gather(
        asyncio.to_thread(_probe_rapid_ocr),
        asyncio.to_thread(_probe_http_ocr_service, "MinerU", mineru_uri),
        asyncio.to_thread(_probe_http_ocr_service, "PaddleX", paddlex_uri),
    )

    health_status = {
        "rapid_ocr": rapid_result,
        "mineru_ocr": mineru_result,
        "paddlex_ocr": paddlex_result,
    }

    # One usable backend is enough for the OCR feature as a whole to count as healthy.
    any_healthy = any(svc["status"] == "healthy" for svc in health_status.values())
    overall_status = "healthy" if any_healthy else "unhealthy"

    return {
        "overall_status": overall_status,
        "services": health_status,
        "message": "OCR服务健康检查完成"
    }
221+
222+
@base.get("/ocr/stats")
223+
async def get_ocr_stats(current_user: User = Depends(get_admin_user)):
224+
"""
225+
获取OCR服务使用统计信息
226+
返回各个OCR服务的处理统计和性能指标
227+
"""
228+
try:
229+
from src.plugins._ocr import get_ocr_stats
230+
stats = get_ocr_stats()
231+
232+
return {
233+
"status": "success",
234+
"stats": stats,
235+
"message": "OCR统计信息获取成功"
236+
}
237+
except Exception as e:
238+
logger.error(f"获取OCR统计信息失败: {str(e)}")
239+
return {
240+
"status": "error",
241+
"stats": {},
242+
"message": f"获取OCR统计信息失败: {str(e)}"
243+
}
244+
136245

src/__init__.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from dotenv import load_dotenv
23

34
load_dotenv("src/.env", override=True)
@@ -8,8 +9,33 @@
89
from src.config import Config # noqa: E402
910
config = Config()
1011

11-
from src.core.lightrag_based_kb import LightRagBasedKB # noqa: E402
12-
knowledge_base = LightRagBasedKB()
12+
# 导入知识库相关模块
13+
from src.core.kb_factory import KnowledgeBaseFactory # noqa: E402
14+
from src.core.kb_manager import KnowledgeBaseManager # noqa: E402
15+
from src.core.lightrag_kb import LightRagKB # noqa: E402
16+
from src.core.chroma_kb import ChromaKB # noqa: E402
17+
from src.core.milvus_kb import MilvusKB # noqa: E402
18+
19+
# Register the knowledge base types with the factory
KnowledgeBaseFactory.register("lightrag", LightRagKB, {
    "description": "基于图检索的知识库,支持实体关系构建和复杂查询"
})

KnowledgeBaseFactory.register("chroma", ChromaKB, {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "description": "基于 ChromaDB 的轻量级向量知识库,适合开发和小规模部署"
})

KnowledgeBaseFactory.register("milvus", MilvusKB, {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "description": "基于 Milvus 的生产级向量知识库,适合大规模高性能部署"
})

# Create the knowledge base manager, rooted under the configured save directory
work_dir = os.path.join(config.save_dir, "knowledge_base_data")
knowledge_base = KnowledgeBaseManager(work_dir)
1339

1440
from src.core import GraphDatabase # noqa: E402
1541
graph_base = GraphDatabase()

src/core/indexing.py

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -122,23 +122,53 @@ def plainreader(file_path):
122122
return text
123123

124124
def parse_pdf(file, params=None):
    """Parse a PDF file, optionally through one of the OCR backends.

    Args:
        file: Path of the PDF file.
        params: Optional settings dict. Its ``enable_ocr`` entry selects the
            reader: ``"disable"`` (the default) uses the plain PDF reader,
            while ``"onnx_rapid_ocr"``, ``"mineru_ocr"`` and ``"paddlex_ocr"``
            select the matching OCR routine. Any other value falls back to
            the plain PDF reader.

    Returns:
        The result of the selected reader (the extracted text).

    Raises:
        OCRServiceException: Re-raised unchanged when an OCR backend raises
            it. Any other exception raised on the non-``"disable"`` path is
            wrapped in an ``OCRServiceException`` with the original exception
            attached as ``__cause__``.
    """
    from src.plugins._ocr import OCRServiceException

    params = params or {}
    opt_ocr = params.get("enable_ocr", "disable")

    # OCR disabled: errors from the plain reader propagate unwrapped.
    if opt_ocr == "disable":
        return pdfreader(file, params=params)

    try:
        if opt_ocr == "onnx_rapid_ocr":
            from src.plugins import ocr
            return ocr.process_pdf(file)

        elif opt_ocr == "mineru_ocr":
            from src.plugins import ocr
            return ocr.process_pdf_mineru(file)

        elif opt_ocr == "paddlex_ocr":
            from src.plugins import ocr
            return ocr.process_pdf_paddlex(file)

        else:
            # Unrecognized option: fall back to the plain reader.
            return pdfreader(file, params=params)

    except OCRServiceException as e:
        logger.error(f"OCR service failed: {e.service_name} - {str(e)}")
        raise
    except Exception as e:
        logger.error(f"PDF parsing failed: {str(e)}")
        # Chain the original exception so the root cause stays in the traceback.
        raise OCRServiceException(
            f"PDF解析失败: {str(e)}",
            opt_ocr,
            "parsing_failed"
        ) from e
172+
143173
async def parse_pdf_async(file, params=None):
    """Run ``parse_pdf`` in a worker thread and return its result.

    Args:
        file: Forwarded to ``parse_pdf`` as its first argument.
        params: Forwarded to ``parse_pdf`` as the ``params`` keyword.
    """
    parsed = await asyncio.to_thread(parse_pdf, file, params=params)
    return parsed

src/core/knowledge_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ async def _process_file_to_markdown(self, file_path: str,
390390
# 使用 OCR 处理 PDF
391391
from src.core.indexing import parse_pdf_async
392392
text = await parse_pdf_async(str(file_path_obj), params=params)
393-
return f"Using OCR to process {file_path_obj.name}\n\n{text}"
393+
return f"# {file_path_obj.name}\n\n{text}"
394394

395395
elif file_ext in ['.txt', '.md']:
396396
# 直接读取文本文件

src/core/lightrag_kb.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,10 +213,13 @@ async def add_content(self, db_id: str, items: List[str],
213213
file_record['status'] = "done"
214214

215215
except Exception as e:
216-
logger.error(f"处理{content_type} {item} 失败: {e}, {traceback.format_exc()}")
216+
error_msg = str(e)
217+
logger.error(f"处理{content_type} {item} 失败: {error_msg}, {traceback.format_exc()}")
217218
self.files_meta[file_id]["status"] = "failed"
219+
self.files_meta[file_id]["error"] = error_msg
218220
self._save_metadata()
219221
file_record['status'] = "failed"
222+
file_record['error'] = error_msg
220223

221224
processed_items_info.append(file_record)
222225

0 commit comments

Comments
 (0)