Skip to content

Commit 78f50ea

Browse files
authored
feat: File and Annotation 2-way sync implementation (#63)
* feat: Refactor configuration and sync logic for improved dataset handling and logging * feat: Enhance annotation synchronization and dataset file management - Added a new `tags_updated_at` field to the `DatasetFiles` model for tracking the last update time of tags. - Implemented new asynchronous methods in the Label Studio client for fetching, creating, updating, and deleting task annotations. - Introduced bidirectional synchronization for annotations between DataMate and Label Studio, allowing for flexible data management. - Updated the sync service to handle annotation conflicts based on timestamps, ensuring data integrity during synchronization. - Enhanced the dataset file response model to include tags and their update timestamps. - Modified the database initialization script to create a new `tags_updated_at` column in the dataset files table. - Updated requirements to ensure compatibility with the latest dependencies.
1 parent d136bad commit 78f50ea

File tree

16 files changed

+1315
-269
lines changed

16 files changed

+1315
-269
lines changed
Lines changed: 12 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,94 +1,19 @@
1-
# ====================================
2-
# Label Studio Adapter Configuration
3-
# ====================================
4-
5-
# =========================
6-
# 应用程序配置
7-
# =========================
8-
APP_NAME="Label Studio Adapter"
9-
APP_VERSION="1.0.0"
10-
APP_DESCRIPTION="Adapter for integrating Data Management System with Label Studio"
11-
DEBUG=true
12-
13-
# =========================
14-
# 服务器配置
15-
# =========================
1+
# Dev settings
162
HOST=0.0.0.0
173
PORT=18000
184

19-
# =========================
20-
# 日志配置
21-
# =========================
22-
LOG_LEVEL=INFO
23-
24-
# =========================
25-
# Label Studio 服务配置
26-
# =========================
27-
# Label Studio 服务地址(根据部署方式调整)
28-
# Docker 环境:http://label-studio:8080
29-
# 本地开发:http://127.0.0.1:8000
30-
LABEL_STUDIO_BASE_URL=http://label-studio:8080
31-
32-
# Label Studio 用户名和密码(用于自动创建用户)
33-
LABEL_STUDIO_USERNAME=[email protected]
34-
LABEL_STUDIO_PASSWORD=password
35-
36-
# Label Studio API 认证 Token(Legacy Token,推荐使用)
37-
# 从 Label Studio UI 的 Account & Settings > Access Token 获取
38-
LABEL_STUDIO_USER_TOKEN=your-label-studio-token-here
39-
40-
# Label Studio 本地文件存储基础路径(容器内路径,用于 Docker 部署时的权限检查)
41-
LABEL_STUDIO_LOCAL_BASE=/label-studio/local_files
42-
43-
# Label Studio 本地文件服务路径前缀(任务数据中的文件路径前缀)
44-
LABEL_STUDIO_FILE_PATH_PREFIX=/data/local-files/?d=
45-
46-
# Label Studio 容器中的本地存储路径(用于配置 Local Storage)
47-
LABEL_STUDIO_LOCAL_STORAGE_DATASET_BASE_PATH=/label-studio/local_files/dataset
48-
LABEL_STUDIO_LOCAL_STORAGE_UPLOAD_BASE_PATH=/label-studio/local_files/upload
49-
50-
# Label Studio 任务列表分页大小
51-
LS_TASK_PAGE_SIZE=1000
52-
53-
# =========================
54-
# Data Management 服务配置
55-
# =========================
56-
# DM 存储文件夹前缀(通常与 Label Studio 的 local-files 文件夹映射一致)
57-
DM_FILE_PATH_PREFIX=/
5+
DEBUG=true
6+
LOG_LEVEL=DEBUG
7+
LOG_FILE_DIR=./logs
588

59-
# =========================
60-
# Adapter 数据库配置 (MySQL)
61-
# =========================
62-
# 优先级1:如果配置了 MySQL,将优先使用 MySQL 数据库
63-
MYSQL_HOST=adapter-db
9+
# Database
10+
MYSQL_HOST=localhost
6411
MYSQL_PORT=3306
65-
MYSQL_USER=label_studio_user
66-
MYSQL_PASSWORD=user_password
67-
MYSQL_DATABASE=label_studio_adapter
68-
69-
# =========================
70-
# CORS 配置
71-
# =========================
72-
# 允许的来源(生产环境建议配置具体域名)
73-
ALLOWED_ORIGINS=["*"]
74-
75-
# 允许的 HTTP 方法
76-
ALLOWED_METHODS=["*"]
77-
78-
# 允许的请求头
79-
ALLOWED_HEADERS=["*"]
80-
81-
# =========================
82-
# Docker Compose 配置
83-
# =========================
84-
# Docker Compose 项目名称前缀
85-
COMPOSE_PROJECT_NAME=ls-adapter
12+
MYSQL_USER=root
13+
MYSQL_PASSWORD=password
14+
MYSQL_DATABASE=datamate
8615

87-
# =========================
88-
# 同步配置(未来扩展)
89-
# =========================
90-
# 批量同步任务的批次大小
91-
SYNC_BATCH_SIZE=100
16+
# Label Studio settings
17+
LABEL_STUDIO_BASE_URL=http://localhost:8080
9218

93-
# 同步失败时的最大重试次数
94-
MAX_RETRIES=3
19+
LABEL_STUDIO_USER_TOKEN="demo_dev_token"

runtime/datamate-python/.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,6 @@
33
.dev.env
44

55
# logs
6-
logs/
6+
logs/
7+
8+
doc/

runtime/datamate-python/app/core/config.py

Lines changed: 20 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,41 @@
11
from pydantic_settings import BaseSettings
22
from pydantic import model_validator
3-
from typing import Optional, List
4-
import os
5-
from pathlib import Path
3+
from typing import Optional
64

75
class Settings(BaseSettings):
86
"""应用程序配置"""
97

108
class Config:
119
env_file = ".env"
1210
case_sensitive = False
13-
extra = 'ignore' # 允许额外字段(如 Shell 脚本专用的环境变量)
11+
extra = 'ignore'
1412

15-
# =========================
16-
# Adapter 服务配置
17-
# =========================
18-
app_name: str = "Label Studio Adapter"
13+
# Service
14+
app_name: str = "DataMate Python Backend"
1915
app_version: str = "1.0.0"
2016
app_description: str = "Adapter for integrating Data Management System with Label Studio"
2117

22-
# 日志配置
23-
log_level: str = "INFO"
24-
debug: bool = True
25-
log_file_dir: str = "/var/log/datamate"
26-
27-
# 服务器配置
2818
host: str = "0.0.0.0"
29-
port: int = 8000
30-
31-
# CORS配置
19+
port: int = 18000
20+
21+
# CORS
3222
# allowed_origins: List[str] = ["*"]
3323
# allowed_methods: List[str] = ["*"]
3424
# allowed_headers: List[str] = ["*"]
3525

36-
# MySQL数据库配置 (优先级1)
26+
# Log
27+
log_level: str = "INFO"
28+
debug: bool = True
29+
log_file_dir: str = "/var/log/datamate"
30+
31+
# Database
3732
mysql_host: str = "datamate-database"
3833
mysql_port: int = 3306
3934
mysql_user: str = "root"
4035
mysql_password: str = "password"
4136
mysql_database: str = "datamate"
4237

43-
# 直接数据库URL配置(如果提供,将覆盖上述配置)
44-
# 初始值为空字符串,在 model_validator 中会被设置为完整的 URL
45-
database_url: str = ""
38+
database_url: str = "" # Will be overridden by build_database_url() if not provided
4639

4740
@model_validator(mode='after')
4841
def build_database_url(self):
@@ -55,22 +48,18 @@ def build_database_url(self):
5548
return self
5649

5750

58-
# =========================
59-
# Label Studio 服务配置
60-
# =========================
51+
# Label Studio
6152
label_studio_base_url: str = "http://label-studio:8000"
62-
label_studio_username: Optional[str] = "[email protected]" # Label Studio 用户名(用于登录)
63-
label_studio_password: Optional[str] = "demoadmin" # Label Studio 密码(用于登录)
53+
label_studio_username: Optional[str] = "[email protected]"
54+
label_studio_password: Optional[str] = "demoadmin"
6455
label_studio_user_token: Optional[str] = "abc123abc123" # Legacy Token
6556

66-
label_studio_local_storage_dataset_base_path: str = "/label-studio/local" # Label Studio容器中的本地存储基础路径
67-
label_studio_file_path_prefix: str = "/data/local-files/?d=" # Label Studio本地文件服务路径前缀
57+
label_studio_local_document_root: str = "/label-studio/local" # Label Studio local file storage path
58+
label_studio_file_path_prefix: str = "/data/local-files/?d=" # Label Studio local file serving URL prefix
6859

6960
ls_task_page_size: int = 1000
7061

71-
# =========================
72-
# Data Management 服务配置
73-
# =========================
62+
# DataMate
7463
dm_file_path_prefix: str = "/dataset" # DM存储文件夹前缀
7564

7665
# 全局设置实例

runtime/datamate-python/app/db/models/dataset_management.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ class DatasetFiles(Base):
6464
file_size = Column(BigInteger, default=0, comment="文件大小(字节)")
6565
check_sum = Column(String(64), nullable=True, comment="文件校验和")
6666
tags = Column(JSON, nullable=True, comment="文件标签信息")
67+
tags_updated_at = Column(TIMESTAMP, nullable=True, comment="标签最后更新时间")
6768
dataset_filemetadata = Column("metadata", JSON, nullable=True, comment="文件元数据")
6869
status = Column(String(50), default='ACTIVE', comment="文件状态:ACTIVE/DELETED/PROCESSING")
6970
upload_time = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="上传时间")

runtime/datamate-python/app/main.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ async def lifespan(app: FastAPI):
4545
yield
4646

4747
# @shutdown
48-
logger.info("DataMate Python Backend shutting down ...")
48+
logger.info("DataMate Python Backend shutting down ...\n\n")
4949

5050
# 创建FastAPI应用
5151
app = FastAPI(
@@ -69,11 +69,7 @@ async def lifespan(app: FastAPI):
6969
app.include_router(router)
7070

7171
# 输出注册的路由(每行一个)
72-
logger.debug("Registered routes:")
73-
for route in app.routes:
74-
route_path = getattr(route, "path", None)
75-
if route_path:
76-
logger.debug(f" {route_path}")
72+
logger.debug(f"Registered routes refer to http://localhost:{settings.port}/redoc")
7773

7874
# 注册全局异常处理器
7975
app.add_exception_handler(StarletteHTTPException, starlette_http_exception_handler) # type: ignore
@@ -102,7 +98,7 @@ async def root():
10298
data={
10399
"message": f"{settings.app_name} is running",
104100
"version": settings.app_version,
105-
"docs_url": "/docs",
101+
"docs_url": "/redoc",
106102
"label_studio_url": settings.label_studio_base_url
107103
}
108104
)

runtime/datamate-python/app/module/annotation/client/labelstudio/client.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,148 @@ async def delete_project(self, project_id: int) -> bool:
380380
logger.error(f"Error while deleting project {project_id}: {e}")
381381
return False
382382

383+
async def get_task_annotations(
    self,
    task_id: int
) -> Optional[List[Dict[str, Any]]]:
    """Fetch the annotations recorded for a task.

    Args:
        task_id: ID of the task whose annotations are requested.

    Returns:
        The decoded JSON body returned by
        ``GET /api/tasks/{task_id}/annotations``, or ``None`` when the
        request fails or any other error occurs (the error is logged).
    """
    try:
        logger.debug(f"Fetching annotations for task: {task_id}")

        endpoint = f"/api/tasks/{task_id}/annotations"
        response = await self.client.get(endpoint)
        # Turn 4xx/5xx responses into httpx.HTTPStatusError.
        response.raise_for_status()

        annotations = response.json()
        logger.debug(f"Fetched {len(annotations)} annotations for task {task_id}")
    except httpx.HTTPStatusError as e:
        logger.error(f"Get task annotations failed HTTP {e.response.status_code}: {e.response.text}")
        return None
    except Exception as e:
        logger.error(f"Error while getting task annotations: {e}")
        return None

    return annotations
412+
413+
async def create_annotation(
    self,
    task_id: int,
    result: List[Dict[str, Any]],
    completed_by: Optional[int] = None
) -> Optional[Dict[str, Any]]:
    """Create a new annotation for a task.

    Args:
        task_id: ID of the task to annotate.
        result: List of annotation result items to store.
        completed_by: ID of the user credited with the annotation
            (optional). The field is omitted from the request only when
            this is ``None``.

    Returns:
        The decoded JSON body describing the created annotation, or
        ``None`` when the request fails or any other error occurs (the
        error is logged).
    """
    try:
        logger.debug(f"Creating annotation for task: {task_id}")

        annotation_data: Dict[str, Any] = {
            "result": result,
            "task": task_id,
        }

        # Compare against None instead of relying on truthiness so that an
        # explicitly supplied ID is never silently dropped.
        if completed_by is not None:
            annotation_data["completed_by"] = completed_by

        response = await self.client.post(
            f"/api/tasks/{task_id}/annotations",
            json=annotation_data
        )
        # Turn 4xx/5xx responses into httpx.HTTPStatusError.
        response.raise_for_status()

        annotation = response.json()
        logger.debug(f"Created annotation {annotation.get('id')} for task {task_id}")

        return annotation

    except httpx.HTTPStatusError as e:
        logger.error(f"Create annotation failed HTTP {e.response.status_code}: {e.response.text}")
        return None
    except Exception as e:
        logger.error(f"Error while creating annotation: {e}")
        return None
457+
458+
async def update_annotation(
    self,
    annotation_id: int,
    result: List[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
    """Replace the result of an existing annotation.

    Args:
        annotation_id: ID of the annotation to update.
        result: New list of annotation result items.

    Returns:
        The decoded JSON body describing the updated annotation, or
        ``None`` when the request fails or any other error occurs (the
        error is logged).
    """
    try:
        logger.debug(f"Updating annotation: {annotation_id}")

        # Only the "result" field is sent in the PATCH body.
        response = await self.client.patch(
            f"/api/annotations/{annotation_id}",
            json={"result": result},
        )
        response.raise_for_status()

        annotation = response.json()
        logger.debug(f"Updated annotation {annotation_id}")
    except httpx.HTTPStatusError as e:
        logger.error(f"Update annotation failed HTTP {e.response.status_code}: {e.response.text}")
        return None
    except Exception as e:
        logger.error(f"Error while updating annotation: {e}")
        return None

    return annotation
496+
497+
async def delete_annotation(
    self,
    annotation_id: int
) -> bool:
    """Delete an annotation.

    Args:
        annotation_id: ID of the annotation to delete.

    Returns:
        ``True`` when the delete request succeeds, ``False`` when the
        request fails or any other error occurs (the error is logged).
    """
    try:
        logger.debug(f"Deleting annotation: {annotation_id}")

        endpoint = f"/api/annotations/{annotation_id}"
        response = await self.client.delete(endpoint)
        # Turn 4xx/5xx responses into httpx.HTTPStatusError.
        response.raise_for_status()

        logger.debug(f"Deleted annotation {annotation_id}")
    except httpx.HTTPStatusError as e:
        logger.error(f"Delete annotation failed HTTP {e.response.status_code}: {e.response.text}")
        return False
    except Exception as e:
        logger.error(f"Error while deleting annotation: {e}")
        return False

    return True
524+
383525
async def create_local_storage(
384526
self,
385527
project_id: int,

0 commit comments

Comments
 (0)