Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions evalscope/benchmarks/refcoco/refcoco_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def record_to_sample(self, record: Dict[str, Any]) -> Union[Sample, List[Sample]

bbox_norm = []
image_base64 = None
image_size = None

if image_bytes:
image_data = Image.open(io.BytesIO(image_bytes))
Expand All @@ -120,6 +121,7 @@ def record_to_sample(self, record: Dict[str, Any]) -> Union[Sample, List[Sample]
x, y, bw, bh = original_bbox
bbox_norm = [x / w, y / h, (x + bw) / w, (y + bh) / h]
image_data = image_data.convert('RGB')
image_size = (w, h)
elif self.eval_mode == EvalMode.BBOX:
image_data = refcoco_bbox_doc_to_visual(image_data, original_bbox)
elif self.eval_mode == EvalMode.SEG:
Expand All @@ -139,7 +141,8 @@ def record_to_sample(self, record: Dict[str, Any]) -> Union[Sample, List[Sample]
'answer': record.get('answer'),
'original_bbox': original_bbox,
'bbox': bbox_norm,
'eval_mode': self.eval_mode
'eval_mode': self.eval_mode,
'image_size': image_size if self.eval_mode == EvalMode.BBOX_REC else None
}

if self.eval_mode == EvalMode.BBOX_REC:
Expand Down Expand Up @@ -183,7 +186,14 @@ def extract_answer(self, prediction: str, task_state: TaskState) -> Union[str, L
match = re.search(pattern, prediction)

if match:
return [float(match.group(i)) for i in range(1, 5)]
coords = [float(match.group(i)) for i in range(1, 5)]
# If any coordinate > 1, treat as pixel coordinates and normalize
image_size = task_state.metadata.get('image_size')
if image_size and any(c > 1.0 for c in coords):
w, h = image_size
coords = [coords[0] / w, coords[1] / h, coords[2] / w, coords[3] / h]
logger.debug(f'Normalized pixel coords to: {coords} using image size {image_size}')
return coords
return [0.0, 0.0, 0.0, 0.0]
else:
return prediction
Expand Down
2 changes: 1 addition & 1 deletion evalscope/perf/plugin/datasets/kontext_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def build_messages(self) -> Iterator[List[Dict]]:

for item in dataset:
pil_image = item['image']
text = item['instruction']
text = item.get('instruction') or 'Describe the image'
base64_image = PIL_to_base64(pil_image, add_header=True)

message = self.create_message(text=text, image_urls=base64_image)
Expand Down
26 changes: 23 additions & 3 deletions evalscope/report/renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,9 @@ def gen_html_report_file(
overall_score = rpt.score

if not rpt.metrics:
model_sections.append(dict(model_name=model, subset_rows=[], chart_div='', analysis_html=''))
model_sections.append(
dict(model_name=model, subset_rows=[], show_category=False, chart_div='', analysis_html='')
)
continue

main_metric = rpt.metrics[0]
Expand All @@ -266,13 +268,25 @@ def gen_html_report_file(
subset_scores: List[float] = []

for cat in main_metric.categories:
# Category name is stored as a tuple; join for display
cat_display = ' / '.join(cat.name) if cat.name else ''
for sub in cat.subsets:
if sub.name == ReportKey.overall_score:
continue
subset_rows.append(dict(subset=sub.name, metric=main_metric.name, score=sub.score, num=sub.num))
subset_rows.append(
dict(
subset=sub.name,
category=cat_display,
metric=main_metric.name,
score=sub.score,
num=sub.num,
)
)
subset_labels.append(sub.name)
subset_scores.append(sub.score)

show_category = True

chart_div = _subset_chart_div(
ds=ds,
model=model,
Expand All @@ -285,7 +299,13 @@ def gen_html_report_file(
analysis_raw = rpt.analysis if rpt.analysis and rpt.analysis.strip() not in ('', 'N/A') else ''
analysis_html = _md_to_html(analysis_raw) if analysis_raw else ''
model_sections.append(
dict(model_name=model, subset_rows=subset_rows, chart_div=chart_div, analysis_html=analysis_html)
dict(
model_name=model,
subset_rows=subset_rows,
show_category=show_category,
chart_div=chart_div,
analysis_html=analysis_html,
)
)

dataset_sections.append(
Expand Down
26 changes: 14 additions & 12 deletions evalscope/report/template/js/i18n_eval.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,13 @@ var I18N = {
'card.subset_scores': 'Subset Scores',

/* ── Table column headers ─────────────────── */
'col.dataset': 'Dataset',
'col.model': 'Model',
'col.metric': 'Metric',
'col.score': 'Score',
'col.num': 'Num',
'col.subset': 'Subset',
'col.dataset': 'Dataset',
'col.model': 'Model',
'col.metric': 'Metric',
'col.score': 'Score',
'col.num': 'Num',
'col.subset': 'Subset',
'col.category': 'Category',

/* ── Inline badge suffixes ────────────────── */
'badge.entries': 'entries',
Expand Down Expand Up @@ -82,12 +83,13 @@ var I18N = {
'card.subset_scores': '子集分数',

/* ── Table column headers ─────────────────── */
'col.dataset': '数据集',
'col.model': '模型',
'col.metric': '指标',
'col.score': '分数',
'col.num': '数量',
'col.subset': '子集',
'col.dataset': '数据集',
'col.model': '模型',
'col.metric': '指标',
'col.score': '分数',
'col.num': '数量',
'col.subset': '子集',
'col.category': '类别',

/* ── Inline badge suffixes ────────────────── */
'badge.entries': '条记录',
Expand Down
6 changes: 6 additions & 0 deletions evalscope/report/template/report.html.j2
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@
<table>
<thead>
<tr>
{% if msec.show_category %}
<th data-i18n="col.category">Category</th>
{% endif %}
<th data-i18n="col.subset">Subset</th>
<th data-i18n="col.metric">Metric</th>
<th data-i18n="col.score">Score</th>
Expand All @@ -167,6 +170,9 @@
<tbody>
{% for row in msec.subset_rows %}
<tr>
{% if msec.show_category %}
<td><span class="tag">{{ row.category }}</span></td>
{% endif %}
<td>{{ row.subset }}</td>
<td><span class="metric-tag">{{ row.metric }}</span></td>
<td class="score-cell">{{ "%.4f" | format(row.score) }}</td>
Expand Down
2 changes: 1 addition & 1 deletion tests/benchmark/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def test_arc(self):
def test_ceval(self):
"""Test CEval dataset."""
dataset_args = {
'subset_list': ['logic', 'law'],
'subset_list': ['logic', 'law', 'computer_network'],
# 'few_shot_num': 0,
}
# self._run_dataset_load_test('ceval')
Expand Down
2 changes: 1 addition & 1 deletion tests/perf/test_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def test_run_perf_vl(self):
url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
api_key=env.get('DASHSCOPE_API_KEY'),
api='openai',
dataset='flickr8k',
dataset='kontext_bench',
min_tokens=100,
max_tokens=100,
tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',
Expand Down
Loading