|
| 1 | +import uuid |
| 2 | +from xml.etree.ElementTree import tostring |
| 3 | + |
| 4 | +from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, ForeignKey, func |
| 5 | +from sqlalchemy.orm import relationship |
| 6 | + |
| 7 | +from app.db.session import Base |
| 8 | +from app.module.generation.schema.generation import CreateSynthesisTaskRequest |
| 9 | + |
| 10 | + |
async def save_synthesis_task(db_session, synthesis_task: CreateSynthesisTaskRequest):
    """Persist a new data-synthesis task and return the stored row.

    A ``DataSynthesisInstance`` is built from the request with a freshly
    generated UUID, status ``"pending"``, zeroed progress counters, and a
    result location of ``/dataset/synthesis_results/<uuid>/``. The row is
    added to the session, committed, and refreshed before being returned.

    Args:
        db_session: Session object whose ``commit``, ``refresh`` and
            ``rollback`` are awaited (presumably a SQLAlchemy
            ``AsyncSession`` -- confirm against the caller).
        synthesis_task: Request payload supplying name, description, model
            id, synthesis type, split/synthesis configs and source file ids.

    Returns:
        The committed and refreshed ``DataSynthesisInstance``.

    Raises:
        Exception: Any error raised by ``commit`` is re-raised after the
            session has been rolled back.
    """
    task_id = str(uuid.uuid4())
    source_file_ids = synthesis_task.source_file_id

    # Translate the request into a model instance.
    task_row = DataSynthesisInstance(
        id=task_id,
        name=synthesis_task.name,
        description=synthesis_task.description,
        status="pending",
        model_id=synthesis_task.model_id,
        synthesis_type=synthesis_task.synthesis_type.value,
        progress=0,
        result_data_location=f"/dataset/synthesis_results/{task_id}/",
        text_split_config=synthesis_task.text_split_config.model_dump(),
        synthesis_config=synthesis_task.synthesis_config.model_dump(),
        source_file_id=source_file_ids,
        total_files=len(source_file_ids),
        processed_files=0,
        total_chunks=0,
        processed_chunks=0,
        total_synthesis_data=0,
        created_at=func.now(),
        updated_at=func.now(),
        created_by="system",
        updated_by="system",
    )
    db_session.add(task_row)

    try:
        await db_session.commit()
    except Exception:
        # A failed commit leaves the session's transaction unusable until it
        # is rolled back; reset it so the caller can keep using the session.
        await db_session.rollback()
        raise

    await db_session.refresh(task_row)
    return task_row
| 41 | + |
| 42 | + |
class DataSynthesisInstance(Base):
    """ORM model for data-synthesis tasks, mapped to ``t_data_synthesis_instances``.

    Reference DDL (column comments translated to English here; the
    ``comment=`` arguments on the columns below carry the original text)::

        create table if not exists t_data_synthesis_instances
        (
            id VARCHAR(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci PRIMARY KEY COMMENT 'UUID',
            name VARCHAR(255) NOT NULL COMMENT 'task name',
            description TEXT COMMENT 'task description',
            status VARCHAR(20) COMMENT 'task status',
            synthesis_type VARCHAR(20) NOT NULL COMMENT 'synthesis type',
            model_id VARCHAR(255) NOT NULL COMMENT 'model ID',
            progress INT DEFAULT 0 COMMENT 'task progress (percent)',
            result_data_location VARCHAR(1000) COMMENT 'result data storage location',
            text_split_config JSON NOT NULL COMMENT 'text split configuration',
            synthesis_config JSON NOT NULL COMMENT 'synthesis configuration',
            source_file_id JSON NOT NULL COMMENT 'list of source file IDs',
            total_files INT DEFAULT 0 COMMENT 'total number of files',
            processed_files INT DEFAULT 0 COMMENT 'number of processed files',
            total_chunks INT DEFAULT 0 COMMENT 'total number of text chunks',
            processed_chunks INT DEFAULT 0 COMMENT 'number of processed text chunks',
            total_synthesis_data INT DEFAULT 0 COMMENT 'total amount of synthesized data',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT 'creation time',
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
            created_by VARCHAR(255) COMMENT 'creator',
            updated_by VARCHAR(255) COMMENT 'updater'
        ) COMMENT='Data synthesis task table (UUID primary key)';
    """

    __tablename__ = "t_data_synthesis_instances"

    # --- identity and descriptive fields ---
    id = Column(
        String(36),
        comment="UUID",
        primary_key=True,
        index=True,
    )
    name = Column(
        String(255),
        comment="任务名称",
        nullable=False,
    )
    description = Column(
        Text,
        comment="任务描述",
        nullable=True,
    )
    status = Column(
        String(20),
        comment="任务状态",
        nullable=True,
    )
    synthesis_type = Column(
        String(20),
        comment="合成类型",
        nullable=False,
    )
    model_id = Column(
        String(255),
        comment="模型ID",
        nullable=False,
    )
    progress = Column(
        Integer,
        comment="任务进度(百分比)",
        default=0,
        nullable=False,
    )
    result_data_location = Column(
        String(1000),
        comment="结果数据存储位置",
        nullable=True,
    )

    # --- JSON payloads ---
    text_split_config = Column(
        JSON,
        comment="文本切片配置",
        nullable=False,
    )
    synthesis_config = Column(
        JSON,
        comment="合成配置",
        nullable=False,
    )
    source_file_id = Column(
        JSON,
        comment="原始文件ID列表",
        nullable=False,
    )

    # --- progress counters ---
    # The reference DDL only gives these DEFAULT 0; the model additionally
    # declares them non-nullable.
    total_files = Column(
        Integer,
        comment="总文件数",
        default=0,
        nullable=False,
    )
    processed_files = Column(
        Integer,
        comment="已处理文件数",
        default=0,
        nullable=False,
    )
    total_chunks = Column(
        Integer,
        comment="总文本块数",
        default=0,
        nullable=False,
    )
    processed_chunks = Column(
        Integer,
        comment="已处理文本块数",
        default=0,
        nullable=False,
    )
    total_synthesis_data = Column(
        Integer,
        comment="总合成数据量",
        default=0,
        nullable=False,
    )

    # --- audit fields ---
    created_at = Column(
        TIMESTAMP,
        comment="创建时间",
        nullable=True,
        server_default=func.current_timestamp(),
    )
    updated_at = Column(
        TIMESTAMP,
        comment="更新时间",
        nullable=True,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
    )
    created_by = Column(
        String(255),
        comment="创建者",
        nullable=True,
    )
    updated_by = Column(
        String(255),
        comment="更新者",
        nullable=True,
    )
| 94 | + |
| 95 | + |
class DataSynthesisFileInstance(Base):
    """ORM model for per-file synthesis tasks, mapped to ``t_data_synthesis_file_instances``.

    Reference DDL (column comments translated to English here; the
    ``comment=`` arguments on the columns below carry the original text)::

        create table if not exists t_data_synthesis_file_instances (
            id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
            synthesis_instance_id VARCHAR(36) COMMENT 'data synthesis task ID',
            file_name VARCHAR(255) NOT NULL COMMENT 'file name',
            source_file_id VARCHAR(255) NOT NULL COMMENT 'source file ID',
            target_file_location VARCHAR(1000) NOT NULL COMMENT 'target file storage location',
            status VARCHAR(20) COMMENT 'task status',
            total_chunks INT DEFAULT 0 COMMENT 'total number of text chunks',
            processed_chunks INT DEFAULT 0 COMMENT 'number of processed text chunks',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT 'creation time',
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
            created_by VARCHAR(255) COMMENT 'creator',
            updated_by VARCHAR(255) COMMENT 'updater'
        ) COMMENT='Data synthesis file task table (UUID primary key)';
    """

    __tablename__ = "t_data_synthesis_file_instances"

    id = Column(String(36), comment="UUID", primary_key=True, index=True)
    # Plain indexed string column; no ForeignKey constraint is declared here.
    # The reference DDL leaves it nullable, the model requires a value.
    synthesis_instance_id = Column(String(36), comment="数据合成任务ID", index=True, nullable=False)
    file_name = Column(String(255), comment="文件名", nullable=False)
    source_file_id = Column(String(255), comment="原始文件ID", nullable=False)
    target_file_location = Column(String(1000), comment="目标文件存储位置", nullable=False)
    status = Column(String(20), comment="任务状态", nullable=True)

    # Chunk counters start at zero.
    total_chunks = Column(Integer, comment="总文本块数", default=0, nullable=False)
    processed_chunks = Column(Integer, comment="已处理文本块数", default=0, nullable=False)

    # Audit fields.
    created_at = Column(
        TIMESTAMP,
        comment="创建时间",
        nullable=True,
        server_default=func.current_timestamp(),
    )
    updated_at = Column(
        TIMESTAMP,
        comment="更新时间",
        nullable=True,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
    )
    created_by = Column(String(255), comment="创建者", nullable=True)
    updated_by = Column(String(255), comment="更新者", nullable=True)
| 141 | + |
| 142 | + |
class DataSynthesisChunkInstance(Base):
    """ORM model for per-chunk synthesis tasks, mapped to ``t_data_synthesis_chunk_instances``.

    Reference DDL (column comments translated to English here; the
    ``comment=`` arguments on the columns below carry the original text)::

        create table if not exists t_data_synthesis_chunk_instances (
            id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
            synthesis_file_instance_id VARCHAR(36) COMMENT 'data synthesis file task ID',
            chunk_index INT COMMENT 'chunk index',
            chunk_content TEXT COMMENT 'chunk content',
            metadata JSON COMMENT 'chunk metadata'
        ) COMMENT='Data synthesis chunk task table (UUID primary key)';
    """

    __tablename__ = "t_data_synthesis_chunk_instances"

    id = Column(
        String(36),
        comment="UUID",
        primary_key=True,
        index=True,
    )
    # Plain indexed string column; no ForeignKey constraint is declared here.
    synthesis_file_instance_id = Column(
        String(36),
        comment="数据合成文件任务ID",
        index=True,
        nullable=False,
    )
    chunk_index = Column(
        Integer,
        comment="分块索引",
        nullable=True,
    )
    chunk_content = Column(
        Text,
        comment="分块内容",
        nullable=True,
    )
    # SQLAlchemy Declarative reserves the attribute name 'metadata', so the
    # Python attribute is named chunk_metadata while the underlying column is
    # still called 'metadata' to stay compatible with the table definition.
    chunk_metadata = Column(
        "metadata",
        JSON,
        comment="分块元数据",
        nullable=True,
    )
| 169 | + |
| 170 | + |
class SynthesisData(Base):
    """ORM model for synthesized result rows, mapped to ``t_data_synthesis_data``.

    Reference DDL (column comments translated to English here; the
    ``comment=`` arguments on the columns below carry the original text)::

        create table if not exists t_data_synthesis_data (
            id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
            data json COMMENT 'synthesized data',
            synthesis_file_instance_id VARCHAR(36) COMMENT 'data synthesis file task ID',
            chunk_instance_id VARCHAR(36) COMMENT 'chunk task ID'
        ) COMMENT='Data synthesis result table (UUID primary key)';

    NOTE(review): the DDL this model was written from named the table
    ``t_synthesis_data`` and labelled it a "task queue" table. The name
    SQLAlchemy actually uses is ``__tablename__`` below
    (``t_data_synthesis_data``) -- confirm the deployed schema script creates
    the table under that name.
    """

    __tablename__ = "t_data_synthesis_data"

    id = Column(
        String(36),
        comment="UUID",
        primary_key=True,
        index=True,
    )
    data = Column(
        JSON,
        comment="合成的数据",
        nullable=True,
    )
    # Both reference columns are plain indexed strings with no ForeignKey
    # constraint; the reference DDL leaves them nullable while the model
    # requires a value.
    synthesis_file_instance_id = Column(
        String(36),
        comment="数据合成文件任务ID",
        index=True,
        nullable=False,
    )
    chunk_instance_id = Column(
        String(36),
        comment="分块任务ID",
        index=True,
        nullable=False,
    )
0 commit comments