diff --git a/runtime/datamate-python/app/module/generation/service/generation_service.py b/runtime/datamate-python/app/module/generation/service/generation_service.py
index a540bfdf..22ee3f7e 100644
--- a/runtime/datamate-python/app/module/generation/service/generation_service.py
+++ b/runtime/datamate-python/app/module/generation/service/generation_service.py
@@ -25,6 +25,17 @@
from app.module.system.service.common_service import chat, get_model_by_id, get_chat_client
+def _filter_docs(split_docs, chunk_size, min_ratio=0.7):
+    """Keep only chunks whose text is at least ``min_ratio * chunk_size`` long.
+
+    Returns a new list in the original order; ``split_docs`` is not mutated.
+    NOTE(review): yields [] when every chunk is short -- confirm callers cope.
+    """
+    # The cutoff is a fraction of chunk_size, not chunk_size itself.
+    min_length = chunk_size * min_ratio
+    return [doc for doc in split_docs if len(doc.page_content) >= min_length]
+
+
class GenerationService:
def __init__(self, db: AsyncSession):
self.db = db
@@ -464,7 +475,7 @@ def _load_and_split(file_path: str, chunk_size: int, chunk_overlap: int):
try:
docs = load_documents(file_path)
split_docs = DocumentSplitter.auto_split(docs, chunk_size, chunk_overlap)
- return split_docs
+ return _filter_docs(split_docs, chunk_size)
except Exception as e:
logger.error(f"Error loading or splitting file {file_path}: {e}")
raise
diff --git a/runtime/datamate-python/app/module/shared/util/model_chat.py b/runtime/datamate-python/app/module/shared/util/model_chat.py
index bac586da..f47b5f1f 100644
--- a/runtime/datamate-python/app/module/shared/util/model_chat.py
+++ b/runtime/datamate-python/app/module/shared/util/model_chat.py
@@ -24,18 +24,41 @@ def extract_json_substring(raw: str) -> str:
- 再从后向前找最后一个 '}' 或 ']' 作为结束;
- 如果找不到合适的边界,就退回原始字符串。
- 部分模型可能会在回复中加入 `...` 内部思考内容,应在解析前先去除。
+    - 也有模型会在 JSON 前后增加如 `<reasoning>...</reasoning>`、`<analysis>...</analysis>` 等标签,本方法会一并去除。
该方法不会保证截取的一定是合法 JSON,但能显著提高 json.loads 的成功率。
"""
if not raw:
return raw
- # 先移除所有 ... 段落(包括跨多行的情况)
try:
import re
- raw = re.sub(r"[\s\S]*?", "", raw, flags=re.IGNORECASE)
+ # 1. 先把所有完整的思考标签块整体去掉:... 等
+ thought_tags = [
+ "think",
+ "thinking",
+ "analysis",
+ "reasoning",
+ "reflection",
+ "inner_thoughts",
+ ]
+ for tag in thought_tags:
+ pattern = rf"<{tag}>[\s\S]*?{tag}>"
+ raw = re.sub(pattern, "", raw, flags=re.IGNORECASE)
+
+        # 2. 再做一次“截取最后一个 `</think>`(或其它思考标签结束)之后的内容”的兜底
+ # 这样就算标签不成对或嵌套异常,也能保留尾部真正的回答
+ last_pos = -1
+ for tag in thought_tags:
+            # 匹配类似 `</think>` 或 `</THINK>`
+ m = list(re.finditer(rf"{tag}>", raw, flags=re.IGNORECASE))
+ if m:
+ last_pos = max(last_pos, m[-1].end())
+ if last_pos != -1 and last_pos < len(raw):
+ raw = raw[last_pos:]
+
except Exception:
- # 正则异常时不影响后续逻辑,继续使用原始文本
+ # 正则异常时不影响后续逻辑,继续使用当前文本
pass
start = None