Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
a6c3712
fix(chart): update Helm chart helpers and values for improved configu…
Dallas98 Dec 11, 2025
175cdb9
feat(SynthesisTaskTab): enhance task table with tooltip support and i…
Dallas98 Dec 11, 2025
5970775
feat(CreateTask, SynthFileTask): improve task creation and detail vie…
Dallas98 Dec 11, 2025
f92dd59
feat(SynthFileTask): enhance file display with progress tracking and …
Dallas98 Dec 11, 2025
01e0301
feat(SynthFileTask): enhance file display with progress tracking and …
Dallas98 Dec 11, 2025
a207659
Merge branch 'main' into dev
Dallas98 Dec 11, 2025
951b065
feat(SynthDataDetail): add delete action for chunks with confirmation…
Dallas98 Dec 11, 2025
a7463ca
feat(SynthDataDetail): update edit and delete buttons to icon-only fo…
Dallas98 Dec 11, 2025
61230fa
feat(SynthDataDetail): add confirmation modals for chunk and synthesi…
Dallas98 Dec 11, 2025
937ae72
Merge branch 'refs/heads/main' into dev
Dallas98 Dec 13, 2025
4aaf0fd
feat(DocumentSplitter): add enhanced document splitting functionality…
Dallas98 Dec 13, 2025
2ec96d9
feat(DataSynthesis): refactor data synthesis models and update task h…
Dallas98 Dec 13, 2025
efc32df
feat(DataSynthesis): streamline synthesis task handling and enhance c…
Dallas98 Dec 15, 2025
02ee5f1
Merge branch 'main' into dev
Dallas98 Dec 16, 2025
b58d561
Merge branch 'main' into dev
Dallas98 Dec 17, 2025
59f8319
feat(DataSynthesis): refactor data synthesis models and update task h…
Dallas98 Dec 17, 2025
9775425
Merge branch 'main' into dev
Dallas98 Dec 17, 2025
81d0ed8
fix(generation_service): ensure processed chunks are incremented rega…
Dallas98 Dec 17, 2025
3bcc48c
feat(CreateTask): enhance task creation with new synthesis templates …
Dallas98 Dec 18, 2025
a847317
feat(CreateTask): enhance task creation with new synthesis templates …
Dallas98 Dec 18, 2025
401ae45
feat(CreateTask): enhance task creation with new synthesis templates …
Dallas98 Dec 18, 2025
97f8fb6
feat(CreateTask): enhance task creation with new synthesis templates …
Dallas98 Dec 18, 2025
45f68a3
feat(model_chat): enhance JSON parsing by removing additional thought…
Dallas98 Dec 18, 2025
98ce106
Merge branch 'main' into dev
Dallas98 Dec 18, 2025
1fcf318
feat(generation_service): add document filtering to remove short docu…
Dallas98 Dec 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@
from app.module.system.service.common_service import chat, get_model_by_id, get_chat_client


def _filter_docs(split_docs, chunk_size, min_ratio=0.7):
    """Drop split documents that are much shorter than the target chunk size.

    A document is kept when ``len(doc.page_content)`` is at least
    ``chunk_size * min_ratio``. The cut-off is therefore a fraction of
    ``chunk_size`` (70% by default), not ``chunk_size`` itself.

    Note that nothing is kept as a fallback: if every document falls below
    the cut-off, the returned list is empty.

    Args:
        split_docs: Iterable of objects exposing a sized ``page_content``
            attribute (presumably the splitter's document objects).
        chunk_size: Chunk size the cut-off is derived from; compared against
            ``len(doc.page_content)``.
        min_ratio: Fraction of ``chunk_size`` a document must reach to be
            kept. Defaults to 0.7.

    Returns:
        A new list holding the surviving documents in their original order.
    """
    # Compute the threshold once instead of on every iteration.
    min_length = chunk_size * min_ratio
    return [doc for doc in split_docs if len(doc.page_content) >= min_length]


class GenerationService:
def __init__(self, db: AsyncSession):
self.db = db
Expand Down Expand Up @@ -464,7 +475,7 @@ def _load_and_split(file_path: str, chunk_size: int, chunk_overlap: int):
try:
docs = load_documents(file_path)
split_docs = DocumentSplitter.auto_split(docs, chunk_size, chunk_overlap)
return split_docs
return _filter_docs(split_docs, chunk_size)
except Exception as e:
logger.error(f"Error loading or splitting file {file_path}: {e}")
raise
Expand Down
29 changes: 26 additions & 3 deletions runtime/datamate-python/app/module/shared/util/model_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,41 @@ def extract_json_substring(raw: str) -> str:
- 再从后向前找最后一个 '}' 或 ']' 作为结束;
- 如果找不到合适的边界,就退回原始字符串。
- 部分模型可能会在回复中加入 `<think>...</think>` 内部思考内容,应在解析前先去除。
- 也有模型会在 JSON 前后增加如 <reasoning>...</reasoning>、<analysis>...</analysis> 等标签,本方法会一并去除。
该方法不会保证截取的一定是合法 JSON,但能显著提高 json.loads 的成功率。
"""
if not raw:
return raw

# 先移除所有 <think>...</think> 段落(包括跨多行的情况)
try:
import re

raw = re.sub(r"<think>[\s\S]*?</think>", "", raw, flags=re.IGNORECASE)
# 1. 先把所有完整的思考标签块整体去掉:<think>...</think> 等
thought_tags = [
"think",
"thinking",
"analysis",
"reasoning",
"reflection",
"inner_thoughts",
]
for tag in thought_tags:
pattern = rf"<{tag}>[\s\S]*?</{tag}>"
raw = re.sub(pattern, "", raw, flags=re.IGNORECASE)

# 2. 再做一次“截取最后一个 </think>(或其它思考标签结束)之后的内容”的兜底
# 这样就算标签不成对或嵌套异常,也能保留尾部真正的回答
last_pos = -1
for tag in thought_tags:
# 匹配类似 </think> 或 </THINK>
m = list(re.finditer(rf"</{tag}>", raw, flags=re.IGNORECASE))
if m:
last_pos = max(last_pos, m[-1].end())
if last_pos != -1 and last_pos < len(raw):
raw = raw[last_pos:]

except Exception:
# 正则异常时不影响后续逻辑,继续使用原始文本
# 正则异常时不影响后续逻辑,继续使用当前文本
pass

start = None
Expand Down