Skip to content

Commit 6e43428

Browse files
authored
[Feature] Return table for service and fix analysis bug (#1240)
* fix analysis * update serve table * update analysis * update analysis
1 parent ade91fa commit 6e43428

File tree

4 files changed

+75
-8
lines changed

4 files changed

+75
-8
lines changed

evalscope/metrics/llm_judge.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def __init__(
8484
self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
8585
self.eval_type = eval_type or EvalType.OPENAI_API
8686
self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
87-
self.generation_config = generation_config or {'temperature': 0.0, 'max_tokens': 1024}
87+
self.generation_config = generation_config or {'temperature': 0.0, 'max_tokens': 4096}
8888

8989
# Default score mapping for A/B pattern
9090
self.score_type = score_type

evalscope/report/report.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -245,17 +245,21 @@ def generate_analysis(self, task_config: 'TaskConfig') -> str:
245245
judge_llm = LLMJudge(
246246
api_key=task_config.api_key,
247247
api_url=task_config.api_url,
248-
model_id=task_config.model_id,
248+
model_id=task_config.model,
249249
eval_type=task_config.eval_type,
250250
)
251251

252252
prompt = ANALYSIS_PROMPT.format(language=language, report_str=self.to_json_str())
253253
response = judge_llm.judge(prompt)
254-
if DEFAULT_LANGUAGE == 'en':
255-
disclaimer = f'> *Generated by {judge_llm.model_id}, for reference only.*'
254+
if response.startswith('[ERROR]'):
255+
logger.warning(f'Analysis generation failed, skipping: {response}')
256+
response = 'N/A'
256257
else:
257-
disclaimer = f'> *由 {judge_llm.model_id} 生成,仅供参考。*'
258-
response = f'{disclaimer}\n\n{response}'
258+
if DEFAULT_LANGUAGE == 'en':
259+
disclaimer = f'> *Generated by {judge_llm.model_id}, for reference only.*'
260+
else:
261+
disclaimer = f'> *由 {judge_llm.model_id} 生成,仅供参考。*'
262+
response = f'{disclaimer}\n\n{response}'
259263
except Exception as e:
260264
logger.error(f'Error generating analysis: {e}')
261265
response = 'N/A'

evalscope/service/blueprints/eval.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import json
22
import os
33
from flask import Blueprint, current_app, jsonify, request, send_file
4+
from tabulate import tabulate
45
from typing import Any, Dict, List
56

67
from evalscope.config import TaskConfig
78
from evalscope.constants import EvalType
9+
from evalscope.report.combinator import get_data_frame, get_report_list
810
from evalscope.utils.logger import get_logger
911

1012
try:
@@ -36,6 +38,40 @@
3638

3739
bp_eval = Blueprint('eval', __name__, url_prefix='/api/v1/eval')
3840

41+
_COLUMN_ZH = {
    'Model': '模型',
    'Dataset': '数据集',
    'Metric': '指标',
    'Subset': '子集',
    'Num': '数量',
    'Score': '得分',
}


def _build_result_table(work_dir: str) -> str:
    """Build a Markdown pipe-table from the JSON report files in *work_dir*/reports.

    Known English column headers are localized to Chinese via ``_COLUMN_ZH``;
    flattened category columns (``Cat.N``) get a ``类别`` prefix instead.

    Returns an empty string when no reports are found or on any error.
    """
    try:
        reports = get_report_list([os.path.join(work_dir, 'reports')])
        if not reports:
            return ''
        frame = get_data_frame(reports, flatten_metrics=True, flatten_categories=True)

        def _localize(name):
            # Map a single column header to its Chinese form, or None to keep it as-is.
            if name in _COLUMN_ZH:
                return _COLUMN_ZH[name]
            if name.startswith('Cat.'):
                return name.replace('Cat.', '类别')
            return None

        renamed = {}
        for name in frame.columns:
            zh = _localize(name)
            if zh is not None:
                renamed[name] = zh
        frame = frame.rename(columns=renamed)
        return tabulate(frame, headers=frame.columns, tablefmt='pipe', showindex=False)
    except Exception as e:
        logger.warning(f'Failed to build result table: {e}')
        return ''
73+
74+
3975
_REQUIRED_FIELDS = ['model', 'datasets', 'api_url']
4076

4177

@@ -97,8 +133,9 @@ def _execute_task(task_id: str, task_config: TaskConfig, label: str = 'Task'):
97133
create_log_file(task_id, os.path.join('logs', 'eval_log.log'))
98134
try:
99135
result = run_in_subprocess(run_eval_wrapper, task_config)
136+
table_str = _build_result_table(task_config.work_dir)
100137
logger.info(f'[{task_id}] {label} completed successfully')
101-
return jsonify({'status': 'completed', 'task_id': task_id, 'result': result})
138+
return jsonify({'status': 'completed', 'task_id': task_id, 'result': result, 'table': table_str})
102139
except Exception as e:
103140
logger.error(f'[{task_id}] {label} failed: {e}')
104141
return jsonify({'status': 'error', 'task_id': task_id, 'error': str(e)}), 500

evalscope/service/blueprints/perf.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import json
22
import os
33
from flask import Blueprint, jsonify, request, send_file
4+
from tabulate import tabulate
45

56
from evalscope.perf.arguments import Arguments as PerfArguments
7+
from evalscope.perf.utils.rich_display import analyze_results
68
from evalscope.utils.logger import get_logger
79

810
try:
@@ -26,6 +28,29 @@
2628

2729
logger = get_logger()
2830

31+
32+
def _build_perf_table(result, api_type: str = None) -> str:
    """Build a Markdown pipe-table from perf benchmark results with Chinese headers.

    The header set depends on whether the benchmark targeted an
    embedding/rerank endpoint (reported by ``analyze_results``).

    Returns an empty string when no valid results are found.
    """
    # Header rows for the two summary layouts returned by analyze_results.
    embedding_headers = ['并发数', '请求速率', '每秒请求数', '平均延迟(s)', 'P99延迟(s)', '平均输入TPS', 'P99输入TPS', '平均输入Token数', '成功率']
    generation_headers = [
        '并发数', '请求速率', '每秒请求数', '平均延迟(s)', 'P99延迟(s)', '平均首字延迟(s)', 'P99首字延迟(s)', '平均每Token延迟(s)',
        'P99每Token延迟(s)', '生成速度(toks/s)', '成功率'
    ]
    try:
        summary, _, _, is_embedding_rerank = analyze_results(result, api_type=api_type)
        if not summary:
            return ''
        headers = embedding_headers if is_embedding_rerank else generation_headers
        return tabulate(summary, headers=headers, tablefmt='pipe')
    except Exception as e:
        logger.warning(f'Failed to build perf table: {e}')
        return ''
52+
53+
2954
bp_perf = Blueprint('perf', __name__, url_prefix='/api/v1/perf')
3055

3156

@@ -69,8 +94,9 @@ def run_performance_test():
6994

7095
try:
7196
result = run_in_subprocess(run_perf_wrapper, perf_args)
97+
table_str = _build_perf_table(result, api_type=perf_args.api)
7298
logger.info(f'[{task_id}] Task completed successfully')
73-
return jsonify({'status': 'completed', 'task_id': task_id, 'result': result})
99+
return jsonify({'status': 'completed', 'task_id': task_id, 'result': result, 'table': table_str})
74100
except Exception as e:
75101
logger.error(f'[{task_id}] Task failed: {e}')
76102
return jsonify({'status': 'error', 'task_id': task_id, 'error': str(e)}), 500

0 commit comments

Comments
 (0)