modelscope · Yunnglin · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026
diff --git a/evalscope/perf/plugin/datasets/kontext_bench.py b/evalscope/perf/plugin/datasets/kontext_bench.py
@@ -21,7 +21,7 @@ def build_messages(self) -> Iterator[List[Dict]]:
 
         for item in dataset:
             pil_image = item['image']
-            text = item['instruction']
+            text = item.get('instruction') or 'Describe the image'
             base64_image = PIL_to_base64(pil_image, add_header=True)
 
             message = self.create_message(text=text, image_urls=base64_image)

diff --git a/evalscope/report/renderer.py b/evalscope/report/renderer.py
@@ -257,7 +257,9 @@ def gen_html_report_file(
             overall_score = rpt.score
 
             if not rpt.metrics:
-                model_sections.append(dict(model_name=model, subset_rows=[], chart_div='', analysis_html=''))
+                model_sections.append(
+                    dict(model_name=model, subset_rows=[], show_category=False, chart_div='', analysis_html='')
+                )
                 continue
 
             main_metric = rpt.metrics[0]
@@ -266,13 +268,28 @@ def gen_html_report_file(
             subset_scores: List[float] = []
 
             for cat in main_metric.categories:
+                # Category name is stored as a tuple; join for display
+                cat_display = ' / '.join(cat.name) if cat.name else ''
                 for sub in cat.subsets:
                     if sub.name == ReportKey.overall_score:
                         continue
-                    subset_rows.append(dict(subset=sub.name, metric=main_metric.name, score=sub.score, num=sub.num))
+                    subset_rows.append(
+                        dict(
+                            subset=sub.name,
+                            category=cat_display,
+                            metric=main_metric.name,
+                            score=sub.score,
+                            num=sub.num,
+                        )
+                    )
                     subset_labels.append(sub.name)
                     subset_scores.append(sub.score)
 
+            # Determine whether the category column adds information
+            # (hide it only when there are no rows or every row's category is empty)
+            unique_cats = {r['category'] for r in subset_rows}
+            show_category = len(unique_cats) > 1 or (len(unique_cats) == 1 and '' not in unique_cats)
+
             chart_div = _subset_chart_div(
                 ds=ds,
                 model=model,
@@ -285,7 +302,13 @@ def gen_html_report_file(
             analysis_raw = rpt.analysis if rpt.analysis and rpt.analysis.strip() not in ('', 'N/A') else ''
             analysis_html = _md_to_html(analysis_raw) if analysis_raw else ''
             model_sections.append(
-                dict(model_name=model, subset_rows=subset_rows, chart_div=chart_div, analysis_html=analysis_html)
+                dict(
+                    model_name=model,
+                    subset_rows=subset_rows,
+                    show_category=show_category,
+                    chart_div=chart_div,
+                    analysis_html=analysis_html,
+                )
             )
 
         dataset_sections.append(

diff --git a/evalscope/report/template/js/i18n_eval.js b/evalscope/report/template/js/i18n_eval.js
@@ -41,12 +41,13 @@ var I18N = {
     'card.subset_scores':  'Subset Scores',
 
     /* ── Table column headers ─────────────────── */
-    'col.dataset': 'Dataset',
-    'col.model':   'Model',
-    'col.metric':  'Metric',
-    'col.score':   'Score',
-    'col.num':     'Num',
-    'col.subset':  'Subset',
+    'col.dataset':   'Dataset',
+    'col.model':     'Model',
+    'col.metric':    'Metric',
+    'col.score':     'Score',
+    'col.num':       'Num',
+    'col.subset':    'Subset',
+    'col.category':  'Category',
 
     /* ── Inline badge suffixes ────────────────── */
     'badge.entries': 'entries',
@@ -82,12 +83,13 @@ var I18N = {
     'card.subset_scores':  '子集分数',
 
     /* ── Table column headers ─────────────────── */
-    'col.dataset': '数据集',
-    'col.model':   '模型',
-    'col.metric':  '指标',
-    'col.score':   '分数',
-    'col.num':     '数量',
-    'col.subset':  '子集',
+    'col.dataset':   '数据集',
+    'col.model':     '模型',
+    'col.metric':    '指标',
+    'col.score':     '分数',
+    'col.num':       '数量',
+    'col.subset':    '子集',
+    'col.category':  '类别',
 
     /* ── Inline badge suffixes ────────────────── */
     'badge.entries': '条记录',

diff --git a/evalscope/report/template/report.html.j2 b/evalscope/report/template/report.html.j2
@@ -158,6 +158,9 @@
               <table>
                 <thead>
                   <tr>
+                    {% if msec.show_category %}
+                    <th data-i18n="col.category">Category</th>
+                    {% endif %}
                     <th data-i18n="col.subset">Subset</th>
                     <th data-i18n="col.metric">Metric</th>
                     <th data-i18n="col.score">Score</th>
@@ -167,6 +170,9 @@
                 <tbody>
                   {% for row in msec.subset_rows %}
                   <tr>
+                    {% if msec.show_category %}
+                    <td><span class="tag">{{ row.category }}</span></td>
+                    {% endif %}
                     <td>{{ row.subset }}</td>
                     <td><span class="metric-tag">{{ row.metric }}</span></td>
                     <td class="score-cell">{{ "%.4f" | format(row.score) }}</td>

diff --git a/tests/perf/test_perf.py b/tests/perf/test_perf.py
@@ -216,7 +216,7 @@ def test_run_perf_vl(self):
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
             api='openai',
-            dataset='flickr8k',
+            dataset='kontext_bench',
             min_tokens=100,
             max_tokens=100,
             tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',