ModelEngine-Group
diff --git a/runtime/datamate-python/app/db/models/data_synthesis.py b/runtime/datamate-python/app/db/models/data_synthesis.py
Lines changed: 50 additions & 39 deletions
diff --git a/runtime/datamate-python/app/module/evaluation/service/evaluation.py b/runtime/datamate-python/app/module/evaluation/service/evaluation.py
Lines changed: 2 additions & 2 deletions
diff --git a/runtime/datamate-python/app/module/generation/interface/generation_api.py b/runtime/datamate-python/app/module/generation/interface/generation_api.py
Lines changed: 80 additions & 39 deletions
@@ -1,94 +1,105 @@
 import uuid
-from xml.etree.ElementTree import tostring
 
-from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, ForeignKey, func
-from sqlalchemy.orm import relationship
+from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, func
 
 from app.db.session import Base
 from app.module.generation.schema.generation import CreateSynthesisTaskRequest
 
 
 async def save_synthesis_task(db_session, synthesis_task: CreateSynthesisTaskRequest):
-    """保存数据合成任务。"""
-    # 转换为模型实例
+    """Persist a data synthesis task and return the refreshed ORM row.
+
+    Note: the current MySQL table `t_data_synth_instances` only has synth_type / synth_config style
+    columns; it has no model_id, text_split_config, source_file_id or result_data_location columns,
+    so only table-compatible fields are saved here; the rest is left to callers or other tables.
+    """
     gid = str(uuid.uuid4())
-    synthesis_task_instance = DataSynthesisInstance(
+
+    # Stay compatible with the old request shape by mapping its fields onto the new columns:
+    #   - synthesis type: synthesis_type -> synth_type
+    #   - text_split_config, synthesis_config, model_id, source_file_id -> bundled into synth_config
+    synth_config = {
+        "text_split_config": synthesis_task.text_split_config.model_dump()
+        if synthesis_task.text_split_config
+        else None,
+        "synthesis_config": synthesis_task.synthesis_config.model_dump()
+        if synthesis_task.synthesis_config
+        else None,
+        "model_id": synthesis_task.model_id,
+        "source_file_id": list(synthesis_task.source_file_id or []),
+    }
+
+    synth_task_instance = DataSynthInstance(
         id=gid,
         name=synthesis_task.name,
         description=synthesis_task.description,
         status="pending",
-        model_id=synthesis_task.model_id,
-        synthesis_type=synthesis_task.synthesis_type.value,
+        synth_type=synthesis_task.synthesis_type.value,
         progress=0,
-        result_data_location=f"/dataset/synthesis_results/{gid}/",
-        text_split_config=synthesis_task.text_split_config.model_dump(),
-        synthesis_config=synthesis_task.synthesis_config.model_dump(),
-        source_file_id=synthesis_task.source_file_id,
-        total_files=len(synthesis_task.source_file_id),
+        synth_config=synth_config,
+        total_files=len(synthesis_task.source_file_id or []),
         processed_files=0,
         total_chunks=0,
         processed_chunks=0,
-        total_synthesis_data=0,
+        total_synth_data=0,
         created_at=func.now(),
         updated_at=func.now(),
         created_by="system",
-        updated_by="system"
+        updated_by="system",
     )
-    db_session.add(synthesis_task_instance)
+    db_session.add(synth_task_instance)
     await db_session.commit()
-    await db_session.refresh(synthesis_task_instance)
-    return synthesis_task_instance
+    await db_session.refresh(synth_task_instance)
+    return synth_task_instance
 
 
-class DataSynthesisInstance(Base):
-    """数据合成任务表，对应表 t_data_synthesis_instances
+class DataSynthInstance(Base):
+    """ORM model for data synthesis tasks; maps to table t_data_synth_instances (DDL quoted verbatim below).
 
-    create table if not exists t_data_synthesis_instances
+    create table if not exists t_data_synth_instances
     (
         id VARCHAR(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci PRIMARY KEY COMMENT 'UUID',
         name VARCHAR(255) NOT NULL COMMENT '任务名称',
         description TEXT COMMENT '任务描述',
         status VARCHAR(20) COMMENT '任务状态',
-        synthesis_type VARCHAR(20) NOT NULL COMMENT '合成类型',
-        model_id VARCHAR(255) NOT NULL COMMENT '模型ID',
+        synth_type VARCHAR(20) NOT NULL COMMENT '合成类型',
         progress INT DEFAULT 0 COMMENT '任务进度(百分比)',
-        result_data_location VARCHAR(1000) COMMENT '结果数据存储位置',
-        text_split_config JSON NOT NULL COMMENT '文本切片配置',
-        synthesis_config JSON NOT NULL COMMENT '合成配置',
-        source_file_id JSON NOT NULL COMMENT '原始文件ID列表',
+        synth_config JSON NOT NULL COMMENT '合成配置',
         total_files INT DEFAULT 0 COMMENT '总文件数',
         processed_files INT DEFAULT 0 COMMENT '已处理文件数',
         total_chunks INT DEFAULT 0 COMMENT '总文本块数',
         processed_chunks INT DEFAULT 0 COMMENT '已处理文本块数',
-        total_synthesis_data INT DEFAULT 0 COMMENT '总合成数据量',
+        total_synth_data INT DEFAULT 0 COMMENT '总合成数据量',
         created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
         updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
         created_by VARCHAR(255) COMMENT '创建者',
         updated_by VARCHAR(255) COMMENT '更新者'
     ) COMMENT='数据合成任务表（UUID 主键）';
     """
 
-    __tablename__ = "t_data_synthesis_instances"
+    __tablename__ = "t_data_synth_instances"
 
     id = Column(String(36), primary_key=True, index=True, comment="UUID")
     name = Column(String(255), nullable=False, comment="任务名称")
     description = Column(Text, nullable=True, comment="任务描述")
     status = Column(String(20), nullable=True, comment="任务状态")
-    synthesis_type = Column(String(20), nullable=False, comment="合成类型")
-    model_id = Column(String(255), nullable=False, comment="模型ID")
+    # Attribute names mirror the database column names: synth_type / synth_config
+    synth_type = Column(String(20), nullable=False, comment="合成类型")
     progress = Column(Integer, nullable=False, default=0, comment="任务进度(百分比)")
-    result_data_location = Column(String(1000), nullable=True, comment="结果数据存储位置")
-    text_split_config = Column(JSON, nullable=False, comment="文本切片配置")
-    synthesis_config = Column(JSON, nullable=False, comment="合成配置")
-    source_file_id = Column(JSON, nullable=False, comment="原始文件ID列表")
+    synth_config = Column(JSON, nullable=False, comment="合成配置")
     total_files = Column(Integer, nullable=False, default=0, comment="总文件数")
     processed_files = Column(Integer, nullable=False, default=0, comment="已处理文件数")
     total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
     processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")
-    total_synthesis_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")
-
-    created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), nullable=True, comment="创建时间")
-    updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), nullable=True, comment="更新时间")
+    total_synth_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")
+    created_at = Column(TIMESTAMP, nullable=False, default=func.now(), comment="创建时间")
+    updated_at = Column(
+        TIMESTAMP,
+        nullable=False,
+        default=func.now(),
+        onupdate=func.now(),
+        comment="更新时间",
+    )
     created_by = Column(String(255), nullable=True, comment="创建者")
     updated_by = Column(String(255), nullable=True, comment="更新者")
 
 
@@ -13,7 +13,7 @@
 from app.db.session import AsyncSessionLocal
 from app.module.evaluation.schema.evaluation import SourceType
 from app.module.shared.schema import TaskStatus
-from app.module.shared.util.model_chat import call_openai_style_model, _extract_json_substring
+from app.module.shared.util.model_chat import call_openai_style_model, extract_json_substring
 from app.module.evaluation.schema.prompt import get_prompt
 from app.module.shared.util.structured_file import StructuredFileHandlerFactory
 from app.module.system.service.common_service import get_model_by_id
@@ -73,7 +73,7 @@ async def evaluate_item(self, model_config, item: EvaluationItem, semaphore: asy
                     call_openai_style_model, model_config.base_url, model_config.api_key, model_config.model_name,
                     prompt_text,
                 )
-                resp_text = _extract_json_substring(resp_text)
+                resp_text = extract_json_substring(resp_text)
                 try:
                     json.loads(resp_text)
                 except Exception as e:
 
@@ -1,4 +1,5 @@
 import uuid
+from typing import cast
 
 from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
 from sqlalchemy import select, func, delete
@@ -7,7 +8,7 @@
 from app.core.logging import get_logger
 from app.db.models.data_synthesis import (
     save_synthesis_task,
-    DataSynthesisInstance,
+    DataSynthInstance,
     DataSynthesisFileInstance,
     DataSynthesisChunkInstance,
     SynthesisData,
@@ -65,32 +66,64 @@ async def create_synthesis_task(
     synthesis_task = await save_synthesis_task(db, request)
 
     # 将已有的 DatasetFiles 记录保存到 t_data_synthesis_file_instances
+    synth_files = []
     for f in dataset_files:
         file_instance = DataSynthesisFileInstance(
             id=str(uuid.uuid4()),  # 使用新的 UUID 作为文件任务记录的主键，避免与 DatasetFiles 主键冲突
             synthesis_instance_id=synthesis_task.id,
             file_name=f.file_name,
             source_file_id=str(f.id),
-            target_file_location=synthesis_task.result_data_location or "",
             status="pending",
             total_chunks=0,
             processed_chunks=0,
             created_by="system",
             updated_by="system",
         )
-        db.add(file_instance)
+        synth_files.append(file_instance)
 
     if dataset_files:
+        db.add_all(synth_files)
         await db.commit()
 
     generation_service = GenerationService(db)
     # 异步处理任务：只传任务 ID，后台任务中使用新的 DB 会话重新加载任务对象
     background_tasks.add_task(generation_service.process_task, synthesis_task.id)
 
+    # 将 ORM 对象包装成 DataSynthesisTaskItem，兼容新字段从 synth_config 还原
+    synth_cfg = getattr(synthesis_task, "synth_config", {}) or {}
+    text_split_cfg = synth_cfg.get("text_split_config") or {}
+    synthesis_cfg = synth_cfg.get("synthesis_config") or {}
+    source_file_ids = synth_cfg.get("source_file_id") or request.source_file_id or []
+    model_id = synth_cfg.get("model_id") or request.model_id
+    result_location = synth_cfg.get("result_data_location")
+
+    task_item = DataSynthesisTaskItem(
+        id=synthesis_task.id,
+        name=synthesis_task.name,
+        description=synthesis_task.description,
+        status=synthesis_task.status,
+        synthesis_type=synthesis_task.synth_type,
+        model_id=model_id,
+        progress=synthesis_task.progress,
+        result_data_location=result_location,
+        text_split_config=text_split_cfg,
+        synthesis_config=synthesis_cfg,
+        source_file_id=list(source_file_ids),
+        total_files=synthesis_task.total_files,
+        processed_files=synthesis_task.processed_files,
+        total_chunks=synthesis_task.total_chunks,
+        processed_chunks=synthesis_task.processed_chunks,
+        total_synthesis_data=synthesis_task.total_synth_data,
+        created_at=synthesis_task.created_at,
+        updated_at=synthesis_task.updated_at,
+        created_by=synthesis_task.created_by,
+        updated_by=synthesis_task.updated_by,
+    )
+
     return StandardResponse(
         code=200,
         message="success",
-        data=synthesis_task,
+        data=task_item,
     )
 
 
@@ -100,7 +133,7 @@ async def get_synthesis_task(
     db: AsyncSession = Depends(get_db)
 ):
     """获取数据合成任务详情"""
-    result = await db.get(DataSynthesisInstance, task_id)
+    result = await db.get(DataSynthInstance, task_id)
     if not result:
         raise HTTPException(status_code=404, detail="Synthesis task not found")
 
@@ -121,16 +154,16 @@ async def list_synthesis_tasks(
     db: AsyncSession = Depends(get_db)
 ):
     """分页列出所有数据合成任务，默认按创建时间倒序"""
-    query = select(DataSynthesisInstance)
+    query = select(DataSynthInstance)
     if synthesis_type:
-        query = query.filter(DataSynthesisInstance.synthesis_type == synthesis_type)
+        query = query.filter(DataSynthInstance.synth_type == synthesis_type)
     if status:
-        query = query.filter(DataSynthesisInstance.status == status)
+        query = query.filter(DataSynthInstance.status == status)
     if name:
-        query = query.filter(DataSynthesisInstance.name.like(f"%{name}%"))
+        query = query.filter(DataSynthInstance.name.like(f"%{name}%"))
 
     # 默认按创建时间倒序排列
-    query = query.order_by(DataSynthesisInstance.created_at.desc())
+    query = query.order_by(DataSynthInstance.created_at.desc())
 
     count_q = select(func.count()).select_from(query.subquery())
     total = (await db.execute(count_q)).scalar_one()
@@ -143,31 +176,39 @@ async def list_synthesis_tasks(
     result = await db.execute(query.offset((page - 1) * page_size).limit(page_size))
     rows = result.scalars().all()
 
-    task_items = [
-        DataSynthesisTaskItem(
-            id=row.id,
-            name=row.name,
-            description=row.description,
-            status=row.status,
-            synthesis_type=row.synthesis_type,
-            model_id=row.model_id,
-            progress=row.progress,
-            result_data_location=row.result_data_location,
-            text_split_config=row.text_split_config,
-            synthesis_config=row.synthesis_config,
-            source_file_id=row.source_file_id,
-            total_files=row.total_files,
-            processed_files=row.processed_files,
-            total_chunks=row.total_chunks,
-            processed_chunks=row.processed_chunks,
-            total_synthesis_data=row.total_synthesis_data,
-            created_at=row.created_at,
-            updated_at=row.updated_at,
-            created_by=row.created_by,
-            updated_by=row.updated_by,
+    task_items: list[DataSynthesisTaskItem] = []
+    for row in rows:
+        synth_cfg = getattr(row, "synth_config", {}) or {}
+        text_split_cfg = synth_cfg.get("text_split_config") or {}
+        synthesis_cfg = synth_cfg.get("synthesis_config") or {}
+        source_file_ids = synth_cfg.get("source_file_id") or []
+        model_id = synth_cfg.get("model_id")
+        result_location = synth_cfg.get("result_data_location")
+
+        task_items.append(
+            DataSynthesisTaskItem(
+                id=str(row.id),
+                name=str(row.name),
+                description=cast(str | None, row.description),
+                status=cast(str | None, row.status),
+                synthesis_type=str(row.synth_type),
+                model_id=model_id or "",
+                progress=int(cast(int, row.progress)),
+                result_data_location=result_location,
+                text_split_config=text_split_cfg,
+                synthesis_config=synthesis_cfg,
+                source_file_id=list(source_file_ids),
+                total_files=int(cast(int, row.total_files)),
+                processed_files=int(cast(int, row.processed_files)),
+                total_chunks=int(cast(int, row.total_chunks)),
+                processed_chunks=int(cast(int, row.processed_chunks)),
+                total_synthesis_data=int(cast(int, row.total_synth_data)),
+                created_at=row.created_at,
+                updated_at=row.updated_at,
+                created_by=row.created_by,
+                updated_by=row.updated_by,
+            )
         )
-        for row in rows
-    ]
 
     paged = PagedDataSynthesisTaskResponse(
         content=task_items,
@@ -190,7 +231,7 @@ async def delete_synthesis_task(
     db: AsyncSession = Depends(get_db)
 ):
     """删除数据合成任务"""
-    task = await db.get(DataSynthesisInstance, task_id)
+    task = await db.get(DataSynthInstance, task_id)
     if not task:
         raise HTTPException(status_code=404, detail="Synthesis task not found")
 
@@ -241,7 +282,7 @@ async def delete_synthesis_file_task(
 ):
     """删除数据合成任务中的文件任务，同时刷新任务表中的文件/切片数量"""
     # 先获取任务和文件任务记录
-    task = await db.get(DataSynthesisInstance, task_id)
+    task = await db.get(DataSynthInstance, task_id)
     if not task:
         raise HTTPException(status_code=404, detail="Synthesis task not found")
 
@@ -306,7 +347,7 @@ async def list_synthesis_file_tasks(
 ):
     """分页获取某个数据合成任务下的文件任务列表"""
     # 先校验任务是否存在
-    task = await db.get(DataSynthesisInstance, task_id)
+    task = await db.get(DataSynthInstance, task_id)
     if not task:
         raise HTTPException(status_code=404, detail="Synthesis task not found")
 
@@ -523,7 +564,7 @@ async def delete_synthesis_data_by_chunk(
     result = await db.execute(
         delete(SynthesisData).where(SynthesisData.chunk_instance_id == chunk_id)
     )
-    deleted = result.rowcount or 0
+    deleted = int(getattr(result, "rowcount", 0) or 0)
 
     await db.commit()
 
@@ -542,7 +583,7 @@ async def batch_delete_synthesis_data(
     result = await db.execute(
         delete(SynthesisData).where(SynthesisData.id.in_(request.ids))
     )
-    deleted = result.rowcount or 0
+    deleted = int(getattr(result, "rowcount", 0) or 0)
     await db.commit()
 
     return StandardResponse(code=200, message="success", data=deleted)