Skip to content

Commit 6e43428

Browse files
authored
[Feature] Return table for service and fix analysis bug (#1240)
* fix analysis * update serve table * update analysis * update analysis
1 parent ade91fa commit 6e43428

File tree

4 files changed

+75
-8
lines changed

4 files changed

+75
-8
lines changed

evalscope/metrics/llm_judge.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def __init__(
8484
self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
8585
self.eval_type = eval_type or EvalType.OPENAI_API
8686
self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
87-
self.generation_config = generation_config or {'temperature': 0.0, 'max_tokens': 1024}
87+
self.generation_config = generation_config or {'temperature': 0.0, 'max_tokens': 4096}
8888

8989
# Default score mapping for A/B pattern
9090
self.score_type = score_type

evalscope/report/report.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -245,17 +245,21 @@ def generate_analysis(self, task_config: 'TaskConfig') -> str:
245245
judge_llm = LLMJudge(
246246
api_key=task_config.api_key,
247247
api_url=task_config.api_url,
248-
model_id=task_config.model_id,
248+
model_id=task_config.model,
249249
eval_type=task_config.eval_type,
250250
)
251251

252252
prompt = ANALYSIS_PROMPT.format(language=language, report_str=self.to_json_str())
253253
response = judge_llm.judge(prompt)
254-
if DEFAULT_LANGUAGE == 'en':
255-
disclaimer = f'> *Generated by {judge_llm.model_id}, for reference only.*'
254+
if response.startswith('[ERROR]'):
255+
logger.warning(f'Analysis generation failed, skipping: {response}')
256+
response = 'N/A'
256257
else:
257-
disclaimer = f'> *由 {judge_llm.model_id} 生成,仅供参考。*'
258-
response = f'{disclaimer}\n\n{response}'
258+
if DEFAULT_LANGUAGE == 'en':
259+
disclaimer = f'> *Generated by {judge_llm.model_id}, for reference only.*'
260+
else:
261+
disclaimer = f'> *由 {judge_llm.model_id} 生成,仅供参考。*'
262+
response = f'{disclaimer}\n\n{response}'
259263
except Exception as e:
260264
logger.error(f'Error generating analysis: {e}')
261265
response = 'N/A'

evalscope/service/blueprints/eval.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import json
22
import os
33
from flask import Blueprint, current_app, jsonify, request, send_file
4+
from tabulate import tabulate
45
from typing import Any, Dict, List
56

67
from evalscope.config import TaskConfig
78
from evalscope.constants import EvalType
9+
from evalscope.report.combinator import get_data_frame, get_report_list
810
from evalscope.utils.logger import get_logger
911

1012
try:
@@ -36,6 +38,40 @@
3638

3739
bp_eval = Blueprint('eval', __name__, url_prefix='/api/v1/eval')
3840

41+
_COLUMN_ZH = {
    'Model': '模型',
    'Dataset': '数据集',
    'Metric': '指标',
    'Subset': '子集',
    'Num': '数量',
    'Score': '得分',
}


def _build_result_table(work_dir: str) -> str:
    """Build a Markdown pipe-table from the JSON report files in *work_dir*/reports.

    Known English column headers are localized to Chinese via ``_COLUMN_ZH``;
    flattened category columns (``Cat.N``) get a ``类别`` prefix instead.

    Returns an empty string when no reports are found or on any error.
    """
    try:
        reports = get_report_list([os.path.join(work_dir, 'reports')])
        if not reports:
            return ''
        frame = get_data_frame(reports, flatten_metrics=True, flatten_categories=True)

        def _localize(name):
            # Map a single column header to its Chinese form, or None to keep it as-is.
            if name in _COLUMN_ZH:
                return _COLUMN_ZH[name]
            if name.startswith('Cat.'):
                return name.replace('Cat.', '类别')
            return None

        renamed = {}
        for name in frame.columns:
            zh = _localize(name)
            if zh is not None:
                renamed[name] = zh
        frame = frame.rename(columns=renamed)
        return tabulate(frame, headers=frame.columns, tablefmt='pipe', showindex=False)
    except Exception as e:
        logger.warning(f'Failed to build result table: {e}')
        return ''
73+
74+
3975
_REQUIRED_FIELDS = ['model', 'datasets', 'api_url']
4076

4177

@@ -97,8 +133,9 @@ def _execute_task(task_id: str, task_config: TaskConfig, label: str = 'Task'):
97133
create_log_file(task_id, os.path.join('logs', 'eval_log.log'))
98134
try:
99135
result = run_in_subprocess(run_eval_wrapper, task_config)
136+
table_str = _build_result_table(task_config.work_dir)
100137
logger.info(f'[{task_id}] {label} completed successfully')
101-
return jsonify({'status': 'completed', 'task_id': task_id, 'result': result})
138+
return jsonify({'status': 'completed', 'task_id': task_id, 'result': result, 'table': table_str})
102139
except Exception as e:
103140
logger.error(f'[{task_id}] {label} failed: {e}')
104141
return jsonify({'status': 'error', 'task_id': task_id, 'error': str(e)}), 500

evalscope/service/blueprints/perf.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import json
22
import os
33
from flask import Blueprint, jsonify, request, send_file
4+
from tabulate import tabulate
45

56
from evalscope.perf.arguments import Arguments as PerfArguments
7+
from evalscope.perf.utils.rich_display import analyze_results
68
from evalscope.utils.logger import get_logger
79

810
try:
@@ -26,6 +28,29 @@
2628

2729
logger = get_logger()
2830

31+
32+
def _build_perf_table(result, api_type: str = None) -> str:
    """Build a Markdown pipe-table from perf benchmark results with Chinese headers.

    The header set depends on whether the benchmark targeted an
    embedding/rerank endpoint (reported by ``analyze_results``).

    Returns an empty string when no valid results are found.
    """
    # Header rows for the two summary layouts returned by analyze_results.
    embedding_headers = ['并发数', '请求速率', '每秒请求数', '平均延迟(s)', 'P99延迟(s)', '平均输入TPS', 'P99输入TPS', '平均输入Token数', '成功率']
    generation_headers = [
        '并发数', '请求速率', '每秒请求数', '平均延迟(s)', 'P99延迟(s)', '平均首字延迟(s)', 'P99首字延迟(s)', '平均每Token延迟(s)',
        'P99每Token延迟(s)', '生成速度(toks/s)', '成功率'
    ]
    try:
        summary, _, _, is_embedding_rerank = analyze_results(result, api_type=api_type)
        if not summary:
            return ''
        headers = embedding_headers if is_embedding_rerank else generation_headers
        return tabulate(summary, headers=headers, tablefmt='pipe')
    except Exception as e:
        logger.warning(f'Failed to build perf table: {e}')
        return ''
52+
53+
2954
bp_perf = Blueprint('perf', __name__, url_prefix='/api/v1/perf')
3055

3156

@@ -69,8 +94,9 @@ def run_performance_test():
6994

7095
try:
7196
result = run_in_subprocess(run_perf_wrapper, perf_args)
97+
table_str = _build_perf_table(result, api_type=perf_args.api)
7298
logger.info(f'[{task_id}] Task completed successfully')
73-
return jsonify({'status': 'completed', 'task_id': task_id, 'result': result})
99+
return jsonify({'status': 'completed', 'task_id': task_id, 'result': result, 'table': table_str})
74100
except Exception as e:
75101
logger.error(f'[{task_id}] Task failed: {e}')
76102
return jsonify({'status': 'error', 'task_id': task_id, 'error': str(e)}), 500

0 commit comments

Comments
 (0)