Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion evalscope/perf/plugin/datasets/kontext_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def build_messages(self) -> Iterator[List[Dict]]:

for item in dataset:
pil_image = item['image']
text = item['instruction']
text = item.get('instruction') or 'Describe the image'
base64_image = PIL_to_base64(pil_image, add_header=True)

message = self.create_message(text=text, image_urls=base64_image)
Expand Down
29 changes: 26 additions & 3 deletions evalscope/report/renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,9 @@ def gen_html_report_file(
overall_score = rpt.score

if not rpt.metrics:
model_sections.append(dict(model_name=model, subset_rows=[], chart_div='', analysis_html=''))
model_sections.append(
dict(model_name=model, subset_rows=[], show_category=False, chart_div='', analysis_html='')
)
continue

main_metric = rpt.metrics[0]
Expand All @@ -266,13 +268,28 @@ def gen_html_report_file(
subset_scores: List[float] = []

for cat in main_metric.categories:
# Category name is stored as a tuple; join for display
cat_display = ' / '.join(cat.name) if cat.name else ''
for sub in cat.subsets:
if sub.name == ReportKey.overall_score:
continue
subset_rows.append(dict(subset=sub.name, metric=main_metric.name, score=sub.score, num=sub.num))
subset_rows.append(
dict(
subset=sub.name,
category=cat_display,
metric=main_metric.name,
score=sub.score,
num=sub.num,
)
)
subset_labels.append(sub.name)
subset_scores.append(sub.score)

# Determine whether the category column adds information.
# It is hidden only when there are no rows or no row carries a category name;
# a single non-empty category shared by every row is still shown.
unique_cats = {r['category'] for r in subset_rows}
show_category = len(unique_cats) > 1 or (len(unique_cats) == 1 and '' not in unique_cats)

chart_div = _subset_chart_div(
ds=ds,
model=model,
Expand All @@ -285,7 +302,13 @@ def gen_html_report_file(
analysis_raw = rpt.analysis if rpt.analysis and rpt.analysis.strip() not in ('', 'N/A') else ''
analysis_html = _md_to_html(analysis_raw) if analysis_raw else ''
model_sections.append(
dict(model_name=model, subset_rows=subset_rows, chart_div=chart_div, analysis_html=analysis_html)
dict(
model_name=model,
subset_rows=subset_rows,
show_category=show_category,
chart_div=chart_div,
analysis_html=analysis_html,
)
)

dataset_sections.append(
Expand Down
26 changes: 14 additions & 12 deletions evalscope/report/template/js/i18n_eval.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,13 @@ var I18N = {
'card.subset_scores': 'Subset Scores',

/* ── Table column headers ─────────────────── */
'col.dataset': 'Dataset',
'col.model': 'Model',
'col.metric': 'Metric',
'col.score': 'Score',
'col.num': 'Num',
'col.subset': 'Subset',
'col.dataset': 'Dataset',
'col.model': 'Model',
'col.metric': 'Metric',
'col.score': 'Score',
'col.num': 'Num',
'col.subset': 'Subset',
'col.category': 'Category',

/* ── Inline badge suffixes ────────────────── */
'badge.entries': 'entries',
Expand Down Expand Up @@ -82,12 +83,13 @@ var I18N = {
'card.subset_scores': '子集分数',

/* ── Table column headers ─────────────────── */
'col.dataset': '数据集',
'col.model': '模型',
'col.metric': '指标',
'col.score': '分数',
'col.num': '数量',
'col.subset': '子集',
'col.dataset': '数据集',
'col.model': '模型',
'col.metric': '指标',
'col.score': '分数',
'col.num': '数量',
'col.subset': '子集',
'col.category': '类别',

/* ── Inline badge suffixes ────────────────── */
'badge.entries': '条记录',
Expand Down
6 changes: 6 additions & 0 deletions evalscope/report/template/report.html.j2
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@
<table>
<thead>
<tr>
{% if msec.show_category %}
<th data-i18n="col.category">Category</th>
{% endif %}
<th data-i18n="col.subset">Subset</th>
<th data-i18n="col.metric">Metric</th>
<th data-i18n="col.score">Score</th>
Expand All @@ -167,6 +170,9 @@
<tbody>
{% for row in msec.subset_rows %}
<tr>
{% if msec.show_category %}
<td><span class="tag">{{ row.category }}</span></td>
{% endif %}
<td>{{ row.subset }}</td>
<td><span class="metric-tag">{{ row.metric }}</span></td>
<td class="score-cell">{{ "%.4f" | format(row.score) }}</td>
Expand Down
2 changes: 1 addition & 1 deletion tests/perf/test_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def test_run_perf_vl(self):
url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
api_key=env.get('DASHSCOPE_API_KEY'),
api='openai',
dataset='flickr8k',
dataset='kontext_bench',
min_tokens=100,
max_tokens=100,
tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',
Expand Down
Loading