Skip to content

Commit 8b164cb

Browse files
authored
feat: Implement data synthesis task management with database models and API endpoints (#122)
1 parent 458afa2 commit 8b164cb

File tree

21 files changed

+1378
-21
lines changed

21 files changed

+1378
-21
lines changed

runtime/datamate-python/README.md

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,21 +19,26 @@ python -m venv .venv
1919
source .venv/bin/activate
2020
```
2121

22-
3. 安装依赖:
22+
3. 安装依赖
23+
由于项目使用poetry管理依赖,你可以使用以下命令安装:
2324

2425
```bash
25-
pip install -r requirements.txt
26+
pip install poetry
27+
poetry install
2628
```
29+
或者直接使用pip安装(如果poetry不可用):
2730

28-
4. 准备环境变量(示例)
29-
30-
创建 `.env` 并设置必要的变量,例如:
31+
```bash
32+
pip install -e .
33+
```
3134

32-
- DATABASE_URL(或根据项目配置使用具体变量)
33-
- LABEL_STUDIO_BASE_URL
34-
- LABEL_STUDIO_USER_TOKEN
35+
4. 配置环境变量
36+
复制环境变量示例文件并配置:
3537

36-
(具体变量请参考 `.env.example`
38+
```bash
39+
cp .env.example .env
40+
```
41+
编辑.env文件,设置必要的环境变量,如数据库连接、Label Studio配置等。
3742

3843
5. 数据库迁移(开发环境):
3944

runtime/datamate-python/app/__init__.py

Whitespace-only changes.

runtime/datamate-python/app/core/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ class Config:
1717

1818
host: str = "0.0.0.0"
1919
port: int = 18000
20-
20+
2121
# CORS
2222
# allowed_origins: List[str] = ["*"]
2323
# allowed_methods: List[str] = ["*"]
@@ -36,7 +36,7 @@ class Config:
3636
mysql_database: str = "datamate"
3737

3838
database_url: str = "" # Will be overridden by build_database_url() if not provided
39-
39+
4040
@model_validator(mode='after')
4141
def build_database_url(self):
4242
"""如果没有提供 database_url,则根据 MySQL 配置构建"""
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
import uuid
2+
from xml.etree.ElementTree import tostring
3+
4+
from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, ForeignKey, func
5+
from sqlalchemy.orm import relationship
6+
7+
from app.db.session import Base
8+
from app.module.generation.schema.generation import CreateSynthesisTaskRequest
9+
10+
11+
async def save_synthesis_task(db_session, synthesis_task: CreateSynthesisTaskRequest):
12+
"""保存数据合成任务。"""
13+
# 转换为模型实例
14+
gid = str(uuid.uuid4())
15+
synthesis_task_instance = DataSynthesisInstance(
16+
id=gid,
17+
name=synthesis_task.name,
18+
description=synthesis_task.description,
19+
status="pending",
20+
model_id=synthesis_task.model_id,
21+
synthesis_type=synthesis_task.synthesis_type.value,
22+
progress=0,
23+
result_data_location=f"/dataset/synthesis_results/{gid}/",
24+
text_split_config=synthesis_task.text_split_config.model_dump(),
25+
synthesis_config=synthesis_task.synthesis_config.model_dump(),
26+
source_file_id=synthesis_task.source_file_id,
27+
total_files=len(synthesis_task.source_file_id),
28+
processed_files=0,
29+
total_chunks=0,
30+
processed_chunks=0,
31+
total_synthesis_data=0,
32+
created_at=func.now(),
33+
updated_at=func.now(),
34+
created_by="system",
35+
updated_by="system"
36+
)
37+
db_session.add(synthesis_task_instance)
38+
await db_session.commit()
39+
await db_session.refresh(synthesis_task_instance)
40+
return synthesis_task_instance
41+
42+
43+
class DataSynthesisInstance(Base):
44+
"""数据合成任务表,对应表 t_data_synthesis_instances
45+
46+
create table if not exists t_data_synthesis_instances
47+
(
48+
id VARCHAR(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci PRIMARY KEY COMMENT 'UUID',
49+
name VARCHAR(255) NOT NULL COMMENT '任务名称',
50+
description TEXT COMMENT '任务描述',
51+
status VARCHAR(20) COMMENT '任务状态',
52+
synthesis_type VARCHAR(20) NOT NULL COMMENT '合成类型',
53+
model_id VARCHAR(255) NOT NULL COMMENT '模型ID',
54+
progress INT DEFAULT 0 COMMENT '任务进度(百分比)',
55+
result_data_location VARCHAR(1000) COMMENT '结果数据存储位置',
56+
text_split_config JSON NOT NULL COMMENT '文本切片配置',
57+
synthesis_config JSON NOT NULL COMMENT '合成配置',
58+
source_file_id JSON NOT NULL COMMENT '原始文件ID列表',
59+
total_files INT DEFAULT 0 COMMENT '总文件数',
60+
processed_files INT DEFAULT 0 COMMENT '已处理文件数',
61+
total_chunks INT DEFAULT 0 COMMENT '总文本块数',
62+
processed_chunks INT DEFAULT 0 COMMENT '已处理文本块数',
63+
total_synthesis_data INT DEFAULT 0 COMMENT '总合成数据量',
64+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
65+
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
66+
created_by VARCHAR(255) COMMENT '创建者',
67+
updated_by VARCHAR(255) COMMENT '更新者'
68+
) COMMENT='数据合成任务表(UUID 主键)';
69+
"""
70+
71+
__tablename__ = "t_data_synthesis_instances"
72+
73+
id = Column(String(36), primary_key=True, index=True, comment="UUID")
74+
name = Column(String(255), nullable=False, comment="任务名称")
75+
description = Column(Text, nullable=True, comment="任务描述")
76+
status = Column(String(20), nullable=True, comment="任务状态")
77+
synthesis_type = Column(String(20), nullable=False, comment="合成类型")
78+
model_id = Column(String(255), nullable=False, comment="模型ID")
79+
progress = Column(Integer, nullable=False, default=0, comment="任务进度(百分比)")
80+
result_data_location = Column(String(1000), nullable=True, comment="结果数据存储位置")
81+
text_split_config = Column(JSON, nullable=False, comment="文本切片配置")
82+
synthesis_config = Column(JSON, nullable=False, comment="合成配置")
83+
source_file_id = Column(JSON, nullable=False, comment="原始文件ID列表")
84+
total_files = Column(Integer, nullable=False, default=0, comment="总文件数")
85+
processed_files = Column(Integer, nullable=False, default=0, comment="已处理文件数")
86+
total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
87+
processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")
88+
total_synthesis_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")
89+
90+
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), nullable=True, comment="创建时间")
91+
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), nullable=True, comment="更新时间")
92+
created_by = Column(String(255), nullable=True, comment="创建者")
93+
updated_by = Column(String(255), nullable=True, comment="更新者")
94+
95+
96+
class DataSynthesisFileInstance(Base):
97+
"""数据合成文件任务表,对应表 t_data_synthesis_file_instances
98+
99+
create table if not exists t_data_synthesis_file_instances (
100+
id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
101+
synthesis_instance_id VARCHAR(36) COMMENT '数据合成任务ID',
102+
file_name VARCHAR(255) NOT NULL COMMENT '文件名',
103+
source_file_id VARCHAR(255) NOT NULL COMMENT '原始文件ID',
104+
target_file_location VARCHAR(1000) NOT NULL COMMENT '目标文件存储位置',
105+
status VARCHAR(20) COMMENT '任务状态',
106+
total_chunks INT DEFAULT 0 COMMENT '总文本块数',
107+
processed_chunks INT DEFAULT 0 COMMENT '已处理文本块数',
108+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
109+
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
110+
created_by VARCHAR(255) COMMENT '创建者',
111+
updated_by VARCHAR(255) COMMENT '更新者'
112+
) COMMENT='数据合成文件任务表(UUID 主键)';
113+
"""
114+
115+
__tablename__ = "t_data_synthesis_file_instances"
116+
117+
id = Column(String(36), primary_key=True, index=True, comment="UUID")
118+
synthesis_instance_id = Column(
119+
String(36),
120+
nullable=False,
121+
comment="数据合成任务ID",
122+
index=True,
123+
)
124+
file_name = Column(String(255), nullable=False, comment="文件名")
125+
source_file_id = Column(String(255), nullable=False, comment="原始文件ID")
126+
target_file_location = Column(String(1000), nullable=False, comment="目标文件存储位置")
127+
status = Column(String(20), nullable=True, comment="任务状态")
128+
total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
129+
processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")
130+
131+
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), nullable=True, comment="创建时间")
132+
updated_at = Column(
133+
TIMESTAMP,
134+
server_default=func.current_timestamp(),
135+
onupdate=func.current_timestamp(),
136+
nullable=True,
137+
comment="更新时间",
138+
)
139+
created_by = Column(String(255), nullable=True, comment="创建者")
140+
updated_by = Column(String(255), nullable=True, comment="更新者")
141+
142+
143+
class DataSynthesisChunkInstance(Base):
144+
"""数据合成分块任务表,对应表 t_data_synthesis_chunk_instances
145+
146+
create table if not exists t_data_synthesis_chunk_instances (
147+
id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
148+
synthesis_file_instance_id VARCHAR(36) COMMENT '数据合成文件任务ID',
149+
chunk_index INT COMMENT '分块索引',
150+
chunk_content TEXT COMMENT '分块内容',
151+
metadata JSON COMMENT '分块元数据'
152+
) COMMENT='数据合成分块任务表(UUID 主键)';
153+
"""
154+
155+
__tablename__ = "t_data_synthesis_chunk_instances"
156+
157+
id = Column(String(36), primary_key=True, index=True, comment="UUID")
158+
synthesis_file_instance_id = Column(
159+
String(36),
160+
nullable=False,
161+
comment="数据合成文件任务ID",
162+
index=True,
163+
)
164+
chunk_index = Column(Integer, nullable=True, comment="分块索引")
165+
chunk_content = Column(Text, nullable=True, comment="分块内容")
166+
# SQLAlchemy Declarative 保留了属性名 'metadata',这里使用 chunk_metadata 作为属性名,
167+
# 底层列名仍为 'metadata' 以保持与表结构兼容。
168+
chunk_metadata = Column("metadata", JSON, nullable=True, comment="分块元数据")
169+
170+
171+
class SynthesisData(Base):
172+
"""数据合成结果表,对应表 t_synthesis_data
173+
174+
create table if not exists t_synthesis_data (
175+
id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
176+
data json COMMENT '合成的数据',
177+
synthesis_file_instance_id VARCHAR(36) COMMENT '数据合成文件任务ID',
178+
chunk_instance_id VARCHAR(36) COMMENT '分块任务ID'
179+
) COMMENT='数据合成任务队列表(UUID 主键)';
180+
"""
181+
182+
__tablename__ = "t_data_synthesis_data"
183+
184+
id = Column(String(36), primary_key=True, index=True, comment="UUID")
185+
data = Column(JSON, nullable=True, comment="合成的数据")
186+
synthesis_file_instance_id = Column(
187+
String(36),
188+
nullable=False,
189+
comment="数据合成文件任务ID",
190+
index=True,
191+
)
192+
chunk_instance_id = Column(
193+
String(36),
194+
nullable=False,
195+
comment="分块任务ID",
196+
index=True,
197+
)
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from sqlalchemy import Column, String, Integer, TIMESTAMP, select
2+
3+
from app.db.session import Base
4+
5+
6+
async def get_model_by_id(db_session, model_id: str):
7+
"""根据 ID 获取单个模型配置。"""
8+
result =await db_session.execute(select(ModelConfig).where(ModelConfig.id == model_id))
9+
model_config = result.scalar_one_or_none()
10+
return model_config
11+
12+
class ModelConfig(Base):
13+
"""模型配置表,对应表 t_model_config
14+
15+
CREATE TABLE IF NOT EXISTS t_model_config (
16+
id VARCHAR(36) PRIMARY KEY COMMENT '主键ID',
17+
model_name VARCHAR(100) NOT NULL COMMENT '模型名称(如 qwen2)',
18+
provider VARCHAR(50) NOT NULL COMMENT '模型提供商(如 Ollama、OpenAI、DeepSeek)',
19+
base_url VARCHAR(255) NOT NULL COMMENT 'API 基础地址',
20+
api_key VARCHAR(512) DEFAULT '' COMMENT 'API 密钥(无密钥则为空)',
21+
type VARCHAR(50) NOT NULL COMMENT '模型类型(如 chat、embedding)',
22+
is_enabled TINYINT DEFAULT 1 COMMENT '是否启用:1-启用,0-禁用',
23+
is_default TINYINT DEFAULT 0 COMMENT '是否默认:1-默认,0-非默认',
24+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
25+
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
26+
created_by VARCHAR(255) COMMENT '创建者',
27+
updated_by VARCHAR(255) COMMENT '更新者',
28+
UNIQUE KEY uk_model_provider (model_name, provider)
29+
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COMMENT ='模型配置表';
30+
"""
31+
32+
__tablename__ = "t_model_config"
33+
34+
id = Column(String(36), primary_key=True, index=True, comment="主键ID")
35+
model_name = Column(String(100), nullable=False, comment="模型名称(如 qwen2)")
36+
provider = Column(String(50), nullable=False, comment="模型提供商(如 Ollama、OpenAI、DeepSeek)")
37+
base_url = Column(String(255), nullable=False, comment="API 基础地址")
38+
api_key = Column(String(512), nullable=False, default="", comment="API 密钥(无密钥则为空)")
39+
type = Column(String(50), nullable=False, comment="模型类型(如 chat、embedding)")
40+
41+
# 使用 Integer 存储 TINYINT,后续可在业务层将 0/1 转为 bool
42+
is_enabled = Column(Integer, nullable=False, default=1, comment="是否启用:1-启用,0-禁用")
43+
is_default = Column(Integer, nullable=False, default=0, comment="是否默认:1-默认,0-非默认")
44+
45+
created_at = Column(TIMESTAMP, nullable=True, comment="创建时间")
46+
updated_at = Column(TIMESTAMP, nullable=True, comment="更新时间")
47+
created_by = Column(String(255), nullable=True, comment="创建者")
48+
updated_by = Column(String(255), nullable=True, comment="更新者")
49+
50+
__table_args__ = (
51+
# 与 DDL 中的 uk_model_provider 保持一致
52+
{
53+
"mysql_engine": "InnoDB",
54+
"mysql_charset": "utf8mb4",
55+
"comment": "模型配置表",
56+
},
57+
)

runtime/datamate-python/app/db/session.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515

1616
# 创建会话工厂
1717
AsyncSessionLocal = async_sessionmaker(
18-
engine,
19-
class_=AsyncSession,
18+
engine,
19+
class_=AsyncSession,
2020
expire_on_commit=False
2121
)
2222

@@ -29,4 +29,3 @@ async def get_db() -> AsyncGenerator[AsyncSession, None]:
2929
yield session
3030
finally:
3131
await session.close()
32-

runtime/datamate-python/app/main.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
@asynccontextmanager
2626
async def lifespan(app: FastAPI):
27-
27+
2828
# @startup
2929
logger.info("DataMate Python Backend starting...")
3030

@@ -43,7 +43,7 @@ async def lifespan(app: FastAPI):
4343
logger.info(f"Label Studio: {settings.label_studio_base_url}")
4444

4545
yield
46-
46+
4747
# @shutdown
4848
logger.info("DataMate Python Backend shutting down ...\n\n")
4949

@@ -105,11 +105,11 @@ async def root():
105105

106106
if __name__ == "__main__":
107107
import uvicorn
108-
108+
109109
uvicorn.run(
110110
"app.main:app",
111111
host=settings.host,
112112
port=settings.port,
113113
reload=settings.debug,
114114
log_level=settings.log_level.lower()
115-
)
115+
)

runtime/datamate-python/app/module/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from .system.interface import router as system_router
44
from .annotation.interface import router as annotation_router
55
from .synthesis.interface import router as ratio_router
6+
from .generation.interface import router as generation_router
67

78
router = APIRouter(
89
prefix="/api"
@@ -11,5 +12,6 @@
1112
router.include_router(system_router)
1213
router.include_router(annotation_router)
1314
router.include_router(ratio_router)
15+
router.include_router(generation_router)
1416

1517
__all__ = ["router"]

runtime/datamate-python/app/module/generation/__init__.py

Whitespace-only changes.
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from fastapi import APIRouter
2+
3+
router = APIRouter(
4+
prefix="/synth",
5+
tags = ["synth"]
6+
)
7+
8+
# Include sub-routers
9+
from .generation_api import router as generation_router_router
10+
11+
router.include_router(generation_router_router)

0 commit comments

Comments
 (0)