Skip to content

Commit e0e9b1d

Browse files
authored
feat:问题生成过程优化及COT数据生成优化 (#169)
* fix(chart): update Helm chart helpers and values for improved configuration
* feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths
* feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates
* feat(SynthFileTask): enhance file display with progress tracking and delete action
* feat(SynthFileTask): enhance file display with progress tracking and delete action
* feat(SynthDataDetail): add delete action for chunks with confirmation prompt
* feat(SynthDataDetail): update edit and delete buttons to icon-only format
* feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion
* feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection
* feat(DataSynthesis): refactor data synthesis models and update task handling logic
* feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic
* feat(DataSynthesis): refactor data synthesis models and update task handling logic
* fix(generation_service): ensure processed chunks are incremented regardless of question generation success
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
1 parent 761f7f6 commit e0e9b1d

File tree

14 files changed

+1364
-573
lines changed

14 files changed

+1364
-573
lines changed

frontend/src/pages/SynthesisTask/CreateTask.tsx

Lines changed: 444 additions & 116 deletions
Large diffs are not rendered by default.

runtime/datamate-python/app/common/text_split.py

Whitespace-only changes.

runtime/datamate-python/app/db/models/data_synthesis.py

Lines changed: 41 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,94 +1,95 @@
11
import uuid
2-
from xml.etree.ElementTree import tostring
32

4-
from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, ForeignKey, func
5-
from sqlalchemy.orm import relationship
3+
from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, func
64

75
from app.db.session import Base
86
from app.module.generation.schema.generation import CreateSynthesisTaskRequest
97

108

119
async def save_synthesis_task(db_session, synthesis_task: CreateSynthesisTaskRequest):
12-
"""保存数据合成任务。"""
13-
# 转换为模型实例
10+
"""保存数据合成任务。
11+
12+
注意:当前 MySQL 表 `t_data_synth_instances` 结构中只包含 synth_type / synth_config 等字段,
13+
没有 model_id、text_split_config、source_file_id、result_data_location 等列,因此这里只保存
14+
与表结构一致的字段,其他信息由上层逻辑或其它表负责管理。
15+
"""
1416
gid = str(uuid.uuid4())
15-
synthesis_task_instance = DataSynthesisInstance(
17+
18+
# 兼容旧请求结构:从请求对象中提取必要字段,
19+
# - 合成类型:synthesis_type -> synth_type
20+
# - 合成配置:text_split_config + synthesis_config 合并后写入 synth_config
21+
22+
synth_task_instance = DataSynthInstance(
1623
id=gid,
1724
name=synthesis_task.name,
1825
description=synthesis_task.description,
1926
status="pending",
20-
model_id=synthesis_task.model_id,
21-
synthesis_type=synthesis_task.synthesis_type.value,
27+
synth_type=synthesis_task.synthesis_type.value,
2228
progress=0,
23-
result_data_location=f"/dataset/synthesis_results/{gid}/",
24-
text_split_config=synthesis_task.text_split_config.model_dump(),
25-
synthesis_config=synthesis_task.synthesis_config.model_dump(),
26-
source_file_id=synthesis_task.source_file_id,
27-
total_files=len(synthesis_task.source_file_id),
29+
synth_config=synthesis_task.synth_config.model_dump(),
30+
total_files=len(synthesis_task.source_file_id or []),
2831
processed_files=0,
2932
total_chunks=0,
3033
processed_chunks=0,
31-
total_synthesis_data=0,
34+
total_synth_data=0,
3235
created_at=func.now(),
3336
updated_at=func.now(),
3437
created_by="system",
35-
updated_by="system"
38+
updated_by="system",
3639
)
37-
db_session.add(synthesis_task_instance)
40+
db_session.add(synth_task_instance)
3841
await db_session.commit()
39-
await db_session.refresh(synthesis_task_instance)
40-
return synthesis_task_instance
42+
await db_session.refresh(synth_task_instance)
43+
return synth_task_instance
4144

4245

43-
class DataSynthesisInstance(Base):
44-
"""数据合成任务表,对应表 t_data_synthesis_instances
46+
class DataSynthInstance(Base):
47+
"""数据合成任务表,对应表 t_data_synth_instances
4548
46-
create table if not exists t_data_synthesis_instances
49+
create table if not exists t_data_synth_instances
4750
(
4851
id VARCHAR(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci PRIMARY KEY COMMENT 'UUID',
4952
name VARCHAR(255) NOT NULL COMMENT '任务名称',
5053
description TEXT COMMENT '任务描述',
5154
status VARCHAR(20) COMMENT '任务状态',
52-
synthesis_type VARCHAR(20) NOT NULL COMMENT '合成类型',
53-
model_id VARCHAR(255) NOT NULL COMMENT '模型ID',
55+
synth_type VARCHAR(20) NOT NULL COMMENT '合成类型',
5456
progress INT DEFAULT 0 COMMENT '任务进度(百分比)',
55-
result_data_location VARCHAR(1000) COMMENT '结果数据存储位置',
56-
text_split_config JSON NOT NULL COMMENT '文本切片配置',
57-
synthesis_config JSON NOT NULL COMMENT '合成配置',
58-
source_file_id JSON NOT NULL COMMENT '原始文件ID列表',
57+
synth_config JSON NOT NULL COMMENT '合成配置',
5958
total_files INT DEFAULT 0 COMMENT '总文件数',
6059
processed_files INT DEFAULT 0 COMMENT '已处理文件数',
6160
total_chunks INT DEFAULT 0 COMMENT '总文本块数',
6261
processed_chunks INT DEFAULT 0 COMMENT '已处理文本块数',
63-
total_synthesis_data INT DEFAULT 0 COMMENT '总合成数据量',
62+
total_synth_data INT DEFAULT 0 COMMENT '总合成数据量',
6463
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
6564
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
6665
created_by VARCHAR(255) COMMENT '创建者',
6766
updated_by VARCHAR(255) COMMENT '更新者'
6867
) COMMENT='数据合成任务表(UUID 主键)';
6968
"""
7069

71-
__tablename__ = "t_data_synthesis_instances"
70+
__tablename__ = "t_data_synth_instances"
7271

7372
id = Column(String(36), primary_key=True, index=True, comment="UUID")
7473
name = Column(String(255), nullable=False, comment="任务名称")
7574
description = Column(Text, nullable=True, comment="任务描述")
7675
status = Column(String(20), nullable=True, comment="任务状态")
77-
synthesis_type = Column(String(20), nullable=False, comment="合成类型")
78-
model_id = Column(String(255), nullable=False, comment="模型ID")
76+
# 与数据库字段保持一致:synth_type / synth_config
77+
synth_type = Column(String(20), nullable=False, comment="合成类型")
7978
progress = Column(Integer, nullable=False, default=0, comment="任务进度(百分比)")
80-
result_data_location = Column(String(1000), nullable=True, comment="结果数据存储位置")
81-
text_split_config = Column(JSON, nullable=False, comment="文本切片配置")
82-
synthesis_config = Column(JSON, nullable=False, comment="合成配置")
83-
source_file_id = Column(JSON, nullable=False, comment="原始文件ID列表")
79+
synth_config = Column(JSON, nullable=False, comment="合成配置")
8480
total_files = Column(Integer, nullable=False, default=0, comment="总文件数")
8581
processed_files = Column(Integer, nullable=False, default=0, comment="已处理文件数")
8682
total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
8783
processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")
88-
total_synthesis_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")
89-
90-
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), nullable=True, comment="创建时间")
91-
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), nullable=True, comment="更新时间")
84+
total_synth_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")
85+
created_at = Column(TIMESTAMP, nullable=False, default=func.now(), comment="创建时间")
86+
updated_at = Column(
87+
TIMESTAMP,
88+
nullable=False,
89+
default=func.now(),
90+
onupdate=func.now(),
91+
comment="更新时间",
92+
)
9293
created_by = Column(String(255), nullable=True, comment="创建者")
9394
updated_by = Column(String(255), nullable=True, comment="更新者")
9495

@@ -123,7 +124,7 @@ class DataSynthesisFileInstance(Base):
123124
)
124125
file_name = Column(String(255), nullable=False, comment="文件名")
125126
source_file_id = Column(String(255), nullable=False, comment="原始文件ID")
126-
target_file_location = Column(String(1000), nullable=False, comment="目标文件存储位置")
127+
target_file_location = Column(String(1000), nullable=True, comment="目标文件存储位置")
127128
status = Column(String(20), nullable=True, comment="任务状态")
128129
total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
129130
processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")

runtime/datamate-python/app/module/evaluation/service/evaluation.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from app.db.session import AsyncSessionLocal
1414
from app.module.evaluation.schema.evaluation import SourceType
1515
from app.module.shared.schema import TaskStatus
16-
from app.module.shared.util.model_chat import call_openai_style_model, _extract_json_substring
16+
from app.module.shared.util.model_chat import call_openai_style_model, extract_json_substring
1717
from app.module.evaluation.schema.prompt import get_prompt
1818
from app.module.shared.util.structured_file import StructuredFileHandlerFactory
1919
from app.module.system.service.common_service import get_model_by_id
@@ -36,8 +36,8 @@ def get_eval_prompt(self, item: EvaluationItem) -> str:
3636
.replace("{question}", eval_content.get("instruction")))
3737
.replace("{answer}", eval_content.get("output")))
3838
if self.task.task_type == "COT":
39-
prompt_text = ((prompt_text.replace("{question}", eval_content.get("question"))
40-
.replace("{conclusion}", eval_content.get("conclusion")))
39+
prompt_text = ((prompt_text.replace("{question}", eval_content.get("instruction"))
40+
.replace("{conclusion}", eval_content.get("output")))
4141
.replace("{chain_of_thought}", eval_content.get("chain_of_thought")))
4242
return prompt_text
4343

@@ -73,7 +73,7 @@ async def evaluate_item(self, model_config, item: EvaluationItem, semaphore: asy
7373
call_openai_style_model, model_config.base_url, model_config.api_key, model_config.model_name,
7474
prompt_text,
7575
)
76-
resp_text = _extract_json_substring(resp_text)
76+
resp_text = extract_json_substring(resp_text)
7777
try:
7878
json.loads(resp_text)
7979
except Exception as e:

0 commit comments

Comments
 (0)