modelscope · Yunnglin · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026
diff --git a/evalscope/benchmarks/refcoco/refcoco_adapter.py b/evalscope/benchmarks/refcoco/refcoco_adapter.py
@@ -109,6 +109,7 @@ def record_to_sample(self, record: Dict[str, Any]) -> Union[Sample, List[Sample]
 
         bbox_norm = []
         image_base64 = None
+        image_size = None
 
         if image_bytes:
             image_data = Image.open(io.BytesIO(image_bytes))
@@ -120,6 +121,7 @@ def record_to_sample(self, record: Dict[str, Any]) -> Union[Sample, List[Sample]
                 x, y, bw, bh = original_bbox
                 bbox_norm = [x / w, y / h, (x + bw) / w, (y + bh) / h]
                 image_data = image_data.convert('RGB')
+                image_size = (w, h)
             elif self.eval_mode == EvalMode.BBOX:
                 image_data = refcoco_bbox_doc_to_visual(image_data, original_bbox)
             elif self.eval_mode == EvalMode.SEG:
@@ -139,7 +141,8 @@ def record_to_sample(self, record: Dict[str, Any]) -> Union[Sample, List[Sample]
             'answer': record.get('answer'),
             'original_bbox': original_bbox,
             'bbox': bbox_norm,
-            'eval_mode': self.eval_mode
+            'eval_mode': self.eval_mode,
+            'image_size': image_size if self.eval_mode == EvalMode.BBOX_REC else None
         }
 
         if self.eval_mode == EvalMode.BBOX_REC:
@@ -183,7 +186,14 @@ def extract_answer(self, prediction: str, task_state: TaskState) -> Union[str, L
             match = re.search(pattern, prediction)
 
             if match:
-                return [float(match.group(i)) for i in range(1, 5)]
+                coords = [float(match.group(i)) for i in range(1, 5)]
+                # Any coordinate > 1 implies pixel units; normalize by (w, h) when the image size is known
+                image_size = task_state.metadata.get('image_size')
+                if image_size and any(c > 1.0 for c in coords):
+                    w, h = image_size
+                    coords = [coords[0] / w, coords[1] / h, coords[2] / w, coords[3] / h]
+                    logger.debug(f'Normalized pixel coords to: {coords} using image size {image_size}')
+                return coords
             return [0.0, 0.0, 0.0, 0.0]
         else:
             return prediction

diff --git a/evalscope/perf/plugin/datasets/kontext_bench.py b/evalscope/perf/plugin/datasets/kontext_bench.py
@@ -21,7 +21,7 @@ def build_messages(self) -> Iterator[List[Dict]]:
 
         for item in dataset:
             pil_image = item['image']
-            text = item['instruction']
+            text = item.get('instruction') or 'Describe the image'
             base64_image = PIL_to_base64(pil_image, add_header=True)
 
             message = self.create_message(text=text, image_urls=base64_image)

diff --git a/evalscope/report/renderer.py b/evalscope/report/renderer.py
@@ -257,7 +257,9 @@ def gen_html_report_file(
             overall_score = rpt.score
 
             if not rpt.metrics:
-                model_sections.append(dict(model_name=model, subset_rows=[], chart_div='', analysis_html=''))
+                model_sections.append(
+                    dict(model_name=model, subset_rows=[], show_category=False, chart_div='', analysis_html='')
+                )
                 continue
 
             main_metric = rpt.metrics[0]
@@ -266,13 +268,25 @@ def gen_html_report_file(
             subset_scores: List[float] = []
 
             for cat in main_metric.categories:
+                # Category name is stored as a tuple; join for display
+                cat_display = ' / '.join(cat.name) if cat.name else ''
                 for sub in cat.subsets:
                     if sub.name == ReportKey.overall_score:
                         continue
-                    subset_rows.append(dict(subset=sub.name, metric=main_metric.name, score=sub.score, num=sub.num))
+                    subset_rows.append(
+                        dict(
+                            subset=sub.name,
+                            category=cat_display,
+                            metric=main_metric.name,
+                            score=sub.score,
+                            num=sub.num,
+                        )
+                    )
                     subset_labels.append(sub.name)
                     subset_scores.append(sub.score)
 
+            show_category = True
+
             chart_div = _subset_chart_div(
                 ds=ds,
                 model=model,
@@ -285,7 +299,13 @@ def gen_html_report_file(
             analysis_raw = rpt.analysis if rpt.analysis and rpt.analysis.strip() not in ('', 'N/A') else ''
             analysis_html = _md_to_html(analysis_raw) if analysis_raw else ''
             model_sections.append(
-                dict(model_name=model, subset_rows=subset_rows, chart_div=chart_div, analysis_html=analysis_html)
+                dict(
+                    model_name=model,
+                    subset_rows=subset_rows,
+                    show_category=show_category,
+                    chart_div=chart_div,
+                    analysis_html=analysis_html,
+                )
             )
 
         dataset_sections.append(

diff --git a/evalscope/report/template/js/i18n_eval.js b/evalscope/report/template/js/i18n_eval.js
@@ -41,12 +41,13 @@ var I18N = {
     'card.subset_scores':  'Subset Scores',
 
     /* ── Table column headers ─────────────────── */
-    'col.dataset': 'Dataset',
-    'col.model':   'Model',
-    'col.metric':  'Metric',
-    'col.score':   'Score',
-    'col.num':     'Num',
-    'col.subset':  'Subset',
+    'col.dataset':   'Dataset',
+    'col.model':     'Model',
+    'col.metric':    'Metric',
+    'col.score':     'Score',
+    'col.num':       'Num',
+    'col.subset':    'Subset',
+    'col.category':  'Category',
 
     /* ── Inline badge suffixes ────────────────── */
     'badge.entries': 'entries',
@@ -82,12 +83,13 @@ var I18N = {
     'card.subset_scores':  '子集分数',
 
     /* ── Table column headers ─────────────────── */
-    'col.dataset': '数据集',
-    'col.model':   '模型',
-    'col.metric':  '指标',
-    'col.score':   '分数',
-    'col.num':     '数量',
-    'col.subset':  '子集',
+    'col.dataset':   '数据集',
+    'col.model':     '模型',
+    'col.metric':    '指标',
+    'col.score':     '分数',
+    'col.num':       '数量',
+    'col.subset':    '子集',
+    'col.category':  '类别',
 
     /* ── Inline badge suffixes ────────────────── */
     'badge.entries': '条记录',

diff --git a/evalscope/report/template/report.html.j2 b/evalscope/report/template/report.html.j2
@@ -158,6 +158,9 @@
               <table>
                 <thead>
                   <tr>
+                    {% if msec.show_category %}
+                    <th data-i18n="col.category">Category</th>
+                    {% endif %}
                     <th data-i18n="col.subset">Subset</th>
                     <th data-i18n="col.metric">Metric</th>
                     <th data-i18n="col.score">Score</th>
@@ -167,6 +170,9 @@
                 <tbody>
                   {% for row in msec.subset_rows %}
                   <tr>
+                    {% if msec.show_category %}
+                    <td><span class="tag">{{ row.category }}</span></td>
+                    {% endif %}
                     <td>{{ row.subset }}</td>
                     <td><span class="metric-tag">{{ row.metric }}</span></td>
                     <td class="score-cell">{{ "%.4f" | format(row.score) }}</td>

diff --git a/tests/benchmark/test_eval.py b/tests/benchmark/test_eval.py
@@ -156,7 +156,7 @@ def test_arc(self):
     def test_ceval(self):
         """Test CEval dataset."""
         dataset_args = {
-            'subset_list': ['logic', 'law'],
+            'subset_list': ['logic', 'law', 'computer_network'],
             # 'few_shot_num': 0,
         }
         # self._run_dataset_load_test('ceval')

diff --git a/tests/perf/test_perf.py b/tests/perf/test_perf.py
@@ -216,7 +216,7 @@ def test_run_perf_vl(self):
             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
             api_key=env.get('DASHSCOPE_API_KEY'),
             api='openai',
-            dataset='flickr8k',
+            dataset='kontext_bench',
             min_tokens=100,
             max_tokens=100,
             tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',