Skip to content

Commit fba0574

Browse files
committed
feat: AI summary in Model Arena
1 parent 69ab6fb commit fba0574

File tree

27 files changed

+1296
-618
lines changed

27 files changed

+1296
-618
lines changed

backend/.flake8

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,12 @@ extend-ignore =
2727
D415,
2828
# B008: Do not perform function calls in argument defaults (common in FastAPI)
2929
B008,
30+
# B010: Do not call setattr with a constant attribute value (SQLAlchemy models need this)
31+
B010,
3032
# F401: imported but unused (handled by isort and IDE)
3133
F401,
34+
# F541: f-string is missing placeholders (sometimes intentional for logging)
35+
F541,
3236
# I100: Import statements are in the wrong order (handled by isort)
3337
I100,
3438
# I101: Imported names are in the wrong order (handled by isort)
@@ -62,4 +66,4 @@ per-file-ignores =
6266
# Fix script can have unused imports
6367
fix_code_*.py:F401
6468

65-
max-complexity = 15
69+
max-complexity = 20

backend/api/api_analysis.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,24 +7,21 @@
77
from fastapi.responses import JSONResponse
88

99
from model.analysis import AnalysisRequest, AnalysisResponse, GetAnalysisResponse
10-
from service.analysis_service import analyze_task_svc, get_analysis_svc
10+
from service.analysis_service import analyze_tasks_svc, get_analysis_svc
1111
from utils.logger import logger
1212

1313
# Create an API router for analysis-related endpoints
1414
router = APIRouter()
1515

1616

17-
@router.post("/{task_id}", response_model=AnalysisResponse)
18-
async def analyze_task(
19-
request: Request, task_id: str, analysis_request: AnalysisRequest
20-
):
17+
@router.post("", response_model=AnalysisResponse)
18+
async def analyze_tasks(request: Request, analysis_request: AnalysisRequest):
2119
"""
22-
Perform AI analysis on task results.
20+
Perform AI analysis on task results (single or multiple tasks).
2321
2422
Args:
2523
request: The incoming request.
26-
task_id: The task ID to analyze (from URL path).
27-
analysis_request: The analysis request (optional eval_prompt).
24+
analysis_request: The analysis request containing task_ids and options.
2825
2926
Returns:
3027
AnalysisResponse: The analysis result.
@@ -33,7 +30,7 @@ async def analyze_task(
3330
HTTPException: If a requested task doesn't exist or the analysis fails.
3431
"""
3532

36-
return await analyze_task_svc(request, task_id, analysis_request)
33+
return await analyze_tasks_svc(request, analysis_request)
3734

3835

3936
@router.get("/{task_id}", response_model=GetAnalysisResponse)

backend/db/init_db.sql

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,5 +65,45 @@ CREATE TABLE `task_results` (
6565
KEY `idx_task_id` (`task_id`)
6666
) ENGINE=InnoDB AUTO_INCREMENT=262 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
6767

68+
-- ----------------------------
69+
-- Table structure for test_insights (AI Analysis)
70+
-- ----------------------------
71+
DROP TABLE IF EXISTS `test_insights`;
72+
CREATE TABLE `test_insights` (
73+
`id` int(11) NOT NULL AUTO_INCREMENT,
74+
`task_id` varchar(36) COLLATE utf8mb4_unicode_ci NOT NULL,
75+
`eval_prompt` text COLLATE utf8mb4_unicode_ci NOT NULL,
76+
`analysis_report` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
77+
`status` varchar(20) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT 'pending' COMMENT 'pending, processing, completed, failed',
78+
`error_message` text COLLATE utf8mb4_unicode_ci,
79+
`created_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
80+
`updated_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
81+
PRIMARY KEY (`id`),
82+
UNIQUE KEY `task_id` (`task_id`),
83+
KEY `idx_status` (`status`),
84+
KEY `idx_created_at` (`created_at`)
85+
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
86+
87+
-- ----------------------------
88+
-- Table structure for analysis_jobs (Background Analysis Jobs)
89+
-- ----------------------------
90+
DROP TABLE IF EXISTS `analysis_jobs`;
91+
CREATE TABLE `analysis_jobs` (
92+
`id` varchar(36) COLLATE utf8mb4_unicode_ci NOT NULL,
93+
`task_ids` text COLLATE utf8mb4_unicode_ci NOT NULL COMMENT 'JSON string of task IDs',
94+
`analysis_type` int(11) NOT NULL COMMENT '0=single task, 1=multiple tasks',
95+
`language` varchar(10) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT 'en',
96+
`eval_prompt` text COLLATE utf8mb4_unicode_ci,
97+
`status` varchar(20) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT 'pending' COMMENT 'pending, processing, completed, failed',
98+
`result_data` longtext COLLATE utf8mb4_unicode_ci COMMENT 'JSON string of analysis result',
99+
`error_message` text COLLATE utf8mb4_unicode_ci,
100+
`created_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
101+
`updated_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
102+
PRIMARY KEY (`id`),
103+
KEY `idx_status` (`status`),
104+
KEY `idx_created_at` (`created_at`),
105+
KEY `idx_analysis_type` (`analysis_type`)
106+
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
107+
68108
-- Finally, re-enable foreign key checks
69-
SET FOREIGN_KEY_CHECKS = 1;
109+
SET FOREIGN_KEY_CHECKS = 1;

backend/model/analysis.py

Lines changed: 76 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,13 @@
33
Copyright (c) 2025, All Rights Reserved.
44
"""
55

6-
from typing import Optional, Union
6+
from datetime import datetime
7+
from typing import List, Optional, Union
8+
from uuid import uuid4
79

810
from pydantic import BaseModel, Field
9-
from sqlalchemy import Column, DateTime, Integer, String, Text, func
11+
from sqlalchemy import Column, DateTime, Integer, String, Text
12+
from sqlalchemy.sql import func
1013

1114
from db.mysql import Base
1215

@@ -17,60 +20,103 @@ class TaskAnalysis(Base):
1720
"""
1821

1922
__tablename__ = "test_insights"
20-
id = Column(Integer, primary_key=True, index=True)
21-
task_id = Column(String(40), nullable=False, unique=True)
23+
24+
id = Column(Integer, primary_key=True, autoincrement=True)
25+
task_id = Column(String(36), nullable=False, unique=True)
2226
eval_prompt = Column(Text, nullable=False)
2327
analysis_report = Column(Text, nullable=False)
24-
status = Column(String(20), nullable=False, default="completed")
28+
status = Column(
29+
String(20), nullable=False, default="pending"
30+
) # pending, processing, completed, failed
2531
error_message = Column(Text, nullable=True)
26-
created_at = Column(DateTime, server_default=func.now())
27-
updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())
32+
created_at = Column(DateTime, nullable=False, server_default=func.now())
33+
updated_at = Column(
34+
DateTime, nullable=False, server_default=func.now(), onupdate=func.now()
35+
)
36+
37+
38+
class AnalysisJob(Base):
39+
"""
40+
SQLAlchemy model for tracking background analysis jobs.
41+
"""
42+
43+
__tablename__ = "analysis_jobs"
44+
45+
id = Column(String(36), primary_key=True, default=lambda: str(uuid4()))
46+
task_ids = Column(Text, nullable=False) # JSON string of task IDs
47+
analysis_type = Column(Integer, nullable=False) # 0=single, 1=multiple
48+
language = Column(String(10), nullable=False, default="en")
49+
eval_prompt = Column(Text, nullable=True)
50+
status = Column(
51+
String(20), nullable=False, default="pending"
52+
) # pending, processing, completed, failed
53+
result_data = Column(Text, nullable=True) # JSON string of analysis result
54+
error_message = Column(Text, nullable=True)
55+
created_at = Column(DateTime, nullable=False, server_default=func.now())
56+
updated_at = Column(
57+
DateTime, nullable=False, server_default=func.now(), onupdate=func.now()
58+
)
2859

2960

3061
class AnalysisRequest(BaseModel):
3162
"""
32-
Request model for AI analysis.
63+
Request model for AI analysis (single or multiple tasks).
64+
"""
65+
66+
task_ids: List[str] = Field(..., description="List of task IDs to analyze")
67+
eval_prompt: Optional[str] = Field(
68+
None, description="Custom evaluation prompt for analysis"
69+
)
70+
language: Optional[str] = Field("en", description="Language for analysis report")
71+
background: Optional[bool] = Field(False, description="Process in background")
72+
3373

34-
Attributes:
35-
task_id: The task ID to analyze.
36-
language: The language for analysis prompt (en/zh).
74+
class AnalysisJobRequest(BaseModel):
75+
"""
76+
Request model for starting a background analysis job.
3777
"""
3878

39-
eval_prompt: Optional[str] = Field(None, description="Custom evaluation prompt")
40-
language: Optional[str] = Field(
41-
"en", description="Language for analysis prompt (en/zh)"
79+
task_ids: List[str] = Field(..., description="List of task IDs to analyze")
80+
eval_prompt: Optional[str] = Field(
81+
None, description="Custom evaluation prompt for analysis"
4282
)
83+
language: Optional[str] = Field("en", description="Language for analysis report")
4384

4485

4586
class AnalysisResponse(BaseModel):
4687
"""
4788
Response model for AI analysis.
48-
49-
Attributes:
50-
task_id: The task ID.
51-
analysis_report: The AI analysis content.
52-
status: The analysis status.
53-
error_message: Error message if analysis failed.
54-
created_at: The creation timestamp.
5589
"""
5690

57-
task_id: str
91+
task_ids: List[str]
5892
analysis_report: str
5993
status: str
6094
error_message: Optional[str] = None
6195
created_at: str
96+
job_id: Optional[str] = Field(
97+
None, description="Background job ID if processed asynchronously"
98+
)
6299

63100

64-
class GetAnalysisResponse(BaseModel):
101+
class AnalysisJobResponse(BaseModel):
102+
"""
103+
Response model for background analysis job status.
65104
"""
66-
Response model for getting analysis results.
67105

68-
Attributes:
69-
data: The analysis data.
70-
status: The status of the response.
71-
error: An error message if the request failed, otherwise None.
106+
job_id: str
107+
task_ids: List[str]
108+
status: str
109+
result_data: Optional[str] = None
110+
error_message: Optional[str] = None
111+
created_at: str
112+
updated_at: str
113+
114+
115+
class GetAnalysisResponse(BaseModel):
116+
"""
117+
Response model for getting AI analysis.
72118
"""
73119

74-
data: Optional[AnalysisResponse] = None
120+
data: Union[AnalysisResponse, None]
75121
status: str
76-
error: Union[str, None]
122+
error: Optional[str] = None

backend/model/task.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -218,13 +218,15 @@ class ModelTaskInfo(BaseModel):
218218
task_id: The task ID.
219219
task_name: The task name.
220220
created_at: The creation timestamp.
221+
duration: The test duration in seconds.
221222
"""
222223

223224
model_name: str
224225
concurrent_users: int
225226
task_id: str
226227
task_name: str
227228
created_at: str
229+
duration: int
228230

229231

230232
class ComparisonRequest(BaseModel):
@@ -249,25 +251,31 @@ class ComparisonMetrics(BaseModel):
249251
model_name: The model name.
250252
concurrent_users: The number of concurrent users.
251253
task_name: The task name.
252-
ttft: Time to first token (avg_latency in seconds).
254+
duration: Test duration with 's' suffix.
255+
stream_mode: Whether stream mode is enabled.
256+
dataset_type: Type of dataset used.
257+
first_token_latency: Time to first token (avg_latency in seconds).
258+
total_time: Total time for request completion.
253259
total_tps: Total tokens per second.
254260
completion_tps: Completion tokens per second.
255-
avg_total_tpr: Average total tokens per request.
256-
avg_completion_tpr: Average completion tokens per request.
257-
avg_response_time: Average response time.
261+
avg_total_tokens_per_req: Average total tokens per request.
262+
avg_completion_tokens_per_req: Average completion tokens per request.
258263
rps: Requests per second.
259264
"""
260265

261266
task_id: str
262267
model_name: str
263268
concurrent_users: int
264269
task_name: str
265-
ttft: float
270+
duration: str
271+
stream_mode: bool
272+
dataset_type: str
273+
first_token_latency: float
274+
total_time: float
266275
total_tps: float
267276
completion_tps: float
268-
avg_total_tpr: float
269-
avg_completion_tpr: float
270-
avg_response_time: float
277+
avg_total_tokens_per_req: float
278+
avg_completion_tokens_per_req: float
271279
rps: float
272280

273281

backend/mypy.ini

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ disallow_untyped_defs = False
99
disallow_incomplete_defs = False
1010
check_untyped_defs = False
1111

12+
# Fix module resolution issues
13+
namespace_packages = True
14+
explicit_package_bases = True
15+
1216
# ignore SQLAlchemy module import errors
1317
[mypy-sqlalchemy.*]
1418
ignore_missing_imports = True

0 commit comments

Comments
 (0)