Skip to content

Commit 7f81956

Browse files
authored
Develop labeling module (#25)
* refactor: remove db table management from LS adapter (mv to scripts later); change adapter to use the same MySQL DB as other modules. * refactor: Rename LS Adapter module to datamate-python
1 parent 46dfb38 commit 7f81956

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

69 files changed

+1104
-703
lines changed
Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,6 @@ LS_TASK_PAGE_SIZE=1000
5353
# =========================
5454
# Data Management 服务配置
5555
# =========================
56-
# DM 服务地址
57-
DM_SERVICE_BASE_URL=http://data-engine:8080
58-
5956
# DM 存储文件夹前缀(通常与 Label Studio 的 local-files 文件夹映射一致)
6057
DM_FILE_PATH_PREFIX=/
6158

runtime/datamate-python/README.md

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# Label Studio Adapter (DataMate)
2+
3+
这是 DataMate 的 Label Studio Adapter 服务,负责将 DataMate 的项目与 Label Studio 同步并提供对外的 HTTP API(基于 FastAPI)。
4+
5+
## 简要说明
6+
7+
- 框架:FastAPI
8+
- 异步数据库/ORM:SQLAlchemy (async)
9+
- 数据库迁移:Alembic
10+
- 运行器:uvicorn
11+
12+
## 快速开始(开发)
13+
14+
1. 克隆仓库并进入项目目录
15+
2. 创建并激活虚拟环境:
16+
17+
```bash
18+
python -m venv .venv
19+
source .venv/bin/activate
20+
```
21+
22+
3. 安装依赖:
23+
24+
```bash
25+
pip install -r requirements.txt
26+
```
27+
28+
4. 准备环境变量(示例)
29+
30+
创建 `.env` 并设置必要的变量,例如:
31+
32+
- DATABASE_URL(或根据项目配置使用具体变量)
33+
- LABEL_STUDIO_BASE_URL
34+
- LABEL_STUDIO_USER_TOKEN
35+
36+
(具体变量请参考 `.env.example`)
37+
38+
5. 数据库迁移(开发环境):
39+
40+
```bash
41+
alembic upgrade head
42+
```
43+
44+
6. 启动开发服务器(示例与常用参数):
45+
46+
- 本地开发(默认 host/port,自动重载):
47+
48+
```bash
49+
uvicorn app.main:app --reload
50+
```
51+
52+
- 指定主机与端口并打开调试日志:
53+
54+
```bash
55+
uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload --log-level debug
56+
```
57+
58+
- 在生产环境使用多个 worker(不使用 --reload):
59+
60+
```bash
61+
uvicorn app.main:app --host 0.0.0.0 --port 8000 --workers 4 --log-level info --proxy-headers
62+
```
63+
64+
- 使用环境变量启动(示例):
65+
66+
```bash
67+
UVICORN_HOST=0.0.0.0 UVICORN_PORT=8000 uvicorn app.main:app --reload
68+
```
69+
70+
注意:
71+
72+
- `--reload` 仅用于开发,会监视文件变化并重启进程;不要在生产中使用。
73+
- `--workers` 提供并发处理能力,但会增加内存占用;生产时通常配合进程管理或容器编排(Kubernetes)使用。
74+
- 若需要完整的生产部署,建议使用 gunicorn 搭配 uvicorn workers,或直接在容器中运行 uvicorn 并由容器编排/进程管理工具负责守护与重启。
75+
76+
访问 API 文档:
77+
78+
- Swagger UI: http://127.0.0.1:8000/docs
79+
- ReDoc: http://127.0.0.1:8000/redoc (推荐使用)
80+
81+
## 使用(简要)
82+
83+
- 所有 API 路径以 `/api` 前缀注册(见 `app/main.py` 中的 `app.include_router(api_router, prefix="/api")`)。
84+
- 根路径 `/` 返回服务信息和文档链接。
85+
86+
更多细节请查看 `doc/usage.md`(接口使用)和 `doc/development.md`(开发说明)。
File renamed without changes.
File renamed without changes.

runtime/label-studio-adapter/app/api/project/__init__.py renamed to runtime/datamate-python/app/api/project/__init__.py

File renamed without changes.

runtime/label-studio-adapter/app/api/project/create.py renamed to runtime/datamate-python/app/api/project/create.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from app.db.database import get_db
66
from app.services.dataset_mapping_service import DatasetMappingService
7-
from app.clients import get_clients
7+
from app.infrastructure import DatamateClient, LabelStudioClient
88
from app.schemas.dataset_mapping import (
99
DatasetMappingCreateRequest,
1010
DatasetMappingCreateResponse,
@@ -30,18 +30,19 @@ async def create_dataset_mapping(
3030
注意:一个数据集可以创建多个标注项目
3131
"""
3232
try:
33-
# 获取全局客户端实例
34-
dm_client_instance, ls_client_instance = get_clients()
33+
dm_client = DatamateClient(db)
34+
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
35+
token=settings.label_studio_user_token)
3536
service = DatasetMappingService(db)
3637

37-
logger.info(f"Create dataset mapping request: {request.source_dataset_id}")
38+
logger.info(f"Create dataset mapping request: {request.dataset_id}")
3839

3940
# 从DM服务获取数据集信息
40-
dataset_info = await dm_client_instance.get_dataset(request.source_dataset_id)
41+
dataset_info = await dm_client.get_dataset(request.dataset_id)
4142
if not dataset_info:
4243
raise HTTPException(
4344
status_code=404,
44-
detail=f"Dataset not found in DM service: {request.source_dataset_id}"
45+
detail=f"Dataset not found in DM service: {request.dataset_id}"
4546
)
4647

4748
# 确定数据类型(基于数据集类型)
@@ -55,11 +56,10 @@ async def create_dataset_mapping(
5556
elif "text" in type_code:
5657
data_type = "text"
5758

58-
# 生成项目名称
5959
project_name = f"{dataset_info.name}"
6060

6161
# 在Label Studio中创建项目
62-
project_data = await ls_client_instance.create_project(
62+
project_data = await ls_client.create_project(
6363
title=project_name,
6464
description=dataset_info.description or f"Imported from DM dataset {dataset_info.id}",
6565
data_type=data_type
@@ -74,8 +74,8 @@ async def create_dataset_mapping(
7474
project_id = project_data["id"]
7575

7676
# 配置本地存储:dataset/<id>
77-
local_storage_path = f"{settings.label_studio_local_storage_dataset_base_path}/{request.source_dataset_id}"
78-
storage_result = await ls_client_instance.create_local_storage(
77+
local_storage_path = f"{settings.label_studio_local_storage_dataset_base_path}/{request.dataset_id}"
78+
storage_result = await ls_client.create_local_storage(
7979
project_id=project_id,
8080
path=local_storage_path,
8181
title="Dataset_BLOB",
@@ -85,7 +85,7 @@ async def create_dataset_mapping(
8585

8686
# 配置本地存储:upload
8787
local_storage_path = f"{settings.label_studio_local_storage_upload_base_path}"
88-
storage_result = await ls_client_instance.create_local_storage(
88+
storage_result = await ls_client.create_local_storage(
8989
project_id=project_id,
9090
path=local_storage_path,
9191
title="Upload_BLOB",
@@ -107,7 +107,7 @@ async def create_dataset_mapping(
107107
)
108108

109109
logger.debug(
110-
f"Dataset mapping created: {mapping.mapping_id} -> S {mapping.source_dataset_id} <> L {mapping.labelling_project_id}"
110+
f"Dataset mapping created: {mapping.mapping_id} -> S {mapping.dataset_id} <> L {mapping.labelling_project_id}"
111111
)
112112

113113
response_data = DatasetMappingCreateResponse(

runtime/label-studio-adapter/app/api/project/delete.py renamed to runtime/datamate-python/app/api/project/delete.py

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
1-
from fastapi import APIRouter, Depends, HTTPException, Query
1+
from fastapi import Depends, HTTPException, Query
22
from sqlalchemy.ext.asyncio import AsyncSession
33
from typing import Optional
44

55
from app.db.database import get_db
66
from app.services.dataset_mapping_service import DatasetMappingService
7-
from app.clients import get_clients
7+
from app.infrastructure import DatamateClient, LabelStudioClient
88
from app.schemas.dataset_mapping import DeleteDatasetResponse
99
from app.schemas import StandardResponse
1010
from app.core.logging import get_logger
11+
from app.core.config import settings
12+
1113
from . import project_router
1214

1315
logger = get_logger(__name__)
@@ -37,39 +39,39 @@ async def delete_mapping(
3739
status_code=400,
3840
detail="Either 'm' (mapping UUID) or 'proj' (project ID) must be provided"
3941
)
40-
41-
# 获取全局客户端实例
42-
dm_client_instance, ls_client_instance = get_clients()
42+
43+
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
44+
token=settings.label_studio_user_token)
4345
service = DatasetMappingService(db)
4446

45-
mapping = None
46-
4747
# 优先使用 mapping_id 查询
4848
if m:
49-
logger.info(f"Deleting by mapping UUID: {m}")
49+
logger.debug(f"Deleting by mapping UUID: {m}")
5050
mapping = await service.get_mapping_by_uuid(m)
5151
# 如果没有提供 m,使用 proj 查询
5252
elif proj:
53-
logger.info(f"Deleting by project ID: {proj}")
53+
logger.debug(f"Deleting by project ID: {proj}")
5454
mapping = await service.get_mapping_by_labelling_project_id(proj)
55+
else:
56+
mapping = None
5557

5658
if not mapping:
5759
raise HTTPException(
5860
status_code=404,
59-
detail=f"Mapping not found"
61+
detail=f"Mapping either not found or not specified."
6062
)
6163

6264
mapping_id = mapping.mapping_id
6365
labelling_project_id = mapping.labelling_project_id
6466
labelling_project_name = mapping.labelling_project_name
6567

66-
logger.info(f"Found mapping: {mapping_id}, Label Studio project ID: {labelling_project_id}")
68+
logger.debug(f"Found mapping: {mapping_id}, Label Studio project ID: {labelling_project_id}")
6769

6870
# 1. 删除 Label Studio 项目
6971
try:
70-
delete_success = await ls_client_instance.delete_project(int(labelling_project_id))
72+
delete_success = await ls_client.delete_project(int(labelling_project_id))
7173
if delete_success:
72-
logger.info(f"Successfully deleted Label Studio project: {labelling_project_id}")
74+
logger.debug(f"Successfully deleted Label Studio project: {labelling_project_id}")
7375
else:
7476
logger.warning(f"Failed to delete Label Studio project or project not found: {labelling_project_id}")
7577
except Exception as e:
@@ -84,19 +86,17 @@ async def delete_mapping(
8486
status_code=500,
8587
detail="Failed to delete mapping record"
8688
)
87-
88-
logger.info(f"Successfully deleted mapping: {mapping_id}")
89-
90-
response_data = DeleteDatasetResponse(
91-
mapping_id=mapping_id,
92-
status="success",
93-
message=f"Successfully deleted mapping and Label Studio project '{labelling_project_name}'"
94-
)
95-
89+
90+
logger.info(f"Successfully deleted mapping: {mapping_id}, Label Studio project: {labelling_project_id}")
91+
9692
return StandardResponse(
9793
code=200,
9894
message="success",
99-
data=response_data
95+
data=DeleteDatasetResponse(
96+
mapping_id=mapping_id,
97+
status="success",
98+
message=f"Successfully deleted mapping and Label Studio project '{labelling_project_name}'"
99+
)
100100
)
101101

102102
except HTTPException:

runtime/label-studio-adapter/app/api/project/list.py renamed to runtime/datamate-python/app/api/project/list.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,9 @@ async def get_mapping(
9898
raise HTTPException(status_code=500, detail="Internal server error")
9999

100100

101-
@project_router.get("/mappings/by-source/{source_dataset_id}", response_model=StandardResponse[PaginatedData[DatasetMappingResponse]])
101+
@project_router.get("/mappings/by-source/{dataset_id}", response_model=StandardResponse[PaginatedData[DatasetMappingResponse]])
102102
async def get_mappings_by_source(
103-
source_dataset_id: str,
103+
dataset_id: str,
104104
page: int = Query(1, ge=1, description="页码(从1开始)"),
105105
page_size: int = Query(20, ge=1, le=100, description="每页记录数"),
106106
db: AsyncSession = Depends(get_db)
@@ -116,11 +116,11 @@ async def get_mappings_by_source(
116116
# 计算 skip
117117
skip = (page - 1) * page_size
118118

119-
logger.info(f"Get mappings by source dataset id: {source_dataset_id}, page={page}, page_size={page_size}")
119+
logger.info(f"Get mappings by source dataset id: {dataset_id}, page={page}, page_size={page_size}")
120120

121121
# 获取数据和总数
122122
mappings, total = await service.get_mappings_by_source_with_count(
123-
source_dataset_id=source_dataset_id,
123+
dataset_id=dataset_id,
124124
skip=skip,
125125
limit=page_size
126126
)

runtime/label-studio-adapter/app/api/project/sync.py renamed to runtime/datamate-python/app/api/project/sync.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from app.db.database import get_db
66
from app.services.dataset_mapping_service import DatasetMappingService
77
from app.services.sync_service import SyncService
8-
from app.clients import get_clients
8+
from app.infrastructure import DatamateClient, LabelStudioClient
99
from app.exceptions import NoDatasetInfoFoundError, DatasetMappingNotFoundError
1010
from app.schemas.dataset_mapping import (
1111
DatasetMappingResponse,
@@ -14,6 +14,7 @@
1414
)
1515
from app.schemas import StandardResponse
1616
from app.core.logging import get_logger
17+
from app.core.config import settings
1718
from . import project_router
1819

1920
logger = get_logger(__name__)
@@ -30,10 +31,12 @@ async def sync_dataset_content(
3031
在数据库中记录更新时间,返回更新状态
3132
"""
3233
try:
33-
dm_client_instance, ls_client_instance = get_clients()
34+
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
35+
token=settings.label_studio_user_token)
36+
dm_client = DatamateClient(db)
3437
mapping_service = DatasetMappingService(db)
35-
sync_service = SyncService(dm_client_instance, ls_client_instance, mapping_service)
36-
38+
sync_service = SyncService(dm_client, ls_client, mapping_service)
39+
3740
logger.info(f"Sync dataset content request: mapping_id={request.mapping_id}")
3841

3942
# 根据 mapping_id 获取映射关系

0 commit comments

Comments
 (0)