Merged
Commits
35 commits
35c9704  Initial proposal for VLM - evaluation (johncalesp, Nov 19, 2025)
f5995c3  address review comments and test hiclass implementation (johncalesp, Nov 20, 2025)
46e4fc0  [Automated Commit] Format Codebase (github-actions[bot], Nov 20, 2025)
0dc13dc  additional fixes to reviews (johncalesp, Nov 23, 2025)
4aa5e0d  [Automated Commit] Format Codebase (github-actions[bot], Nov 23, 2025)
5e1590b  address PR comments (johncalesp, Nov 24, 2025)
e1ccc85  [Automated Commit] Format Codebase (github-actions[bot], Nov 24, 2025)
7e0c444  add a more detail description of the field dataset.split (johncalesp, Nov 24, 2025)
b35b057  Enable exception logging in _query_endpoint_async (wangshangsam, Nov 25, 2025)
48b5bdb  [Automated Commit] Format Codebase (github-actions[bot], Nov 25, 2025)
0e4c5ee  Merge branch 'master' into jcalderon/vlm-accuracy-eval (wangshangsam, Nov 25, 2025)
f8e1498  [Automated Commit] Format Codebase (github-actions[bot], Nov 25, 2025)
f464499  Trigger CI/CD pipeline (johncalesp, Nov 25, 2025)
9609cd0  Merge branch 'master' into jcalderon/vlm-accuracy-eval (wangshangsam, Nov 26, 2025)
bc56ec9  Add performance_sample_count_override as a CLI flag. (wangshangsam, Nov 26, 2025)
b8e2909  Merge branch 'jcalderon/vlm-accuracy-eval' of github.com:CentML/mlper… (wangshangsam, Nov 26, 2025)
8b43239  [Automated Commit] Format Codebase (github-actions[bot], Nov 26, 2025)
9466529  Merge branch 'master' into jcalderon/vlm-accuracy-eval (wangshangsam, Nov 26, 2025)
dae5065  add json format to queries (johncalesp, Nov 26, 2025)
c840dd6  [Automated Commit] Format Codebase (github-actions[bot], Nov 26, 2025)
0b45001  added schema file and made necessary changes (johncalesp, Nov 26, 2025)
5f1d02c  [Automated Commit] Format Codebase (github-actions[bot], Nov 26, 2025)
1849d6c  refactoring and linting (wangshangsam, Nov 27, 2025)
eef83eb  [Automated Commit] Format Codebase (github-actions[bot], Nov 27, 2025)
dafa7f1  Add Dockerfile (wangshangsam, Nov 28, 2025)
ee91e7f  Add use_guided_decoding to let user choose to use guided_decoding or … (wangshangsam, Nov 29, 2025)
b9dd5ad  [Automated Commit] Format Codebase (github-actions[bot], Nov 29, 2025)
ace336e  add f1 scores of uniform random selection (johncalesp, Dec 1, 2025)
60f72be  [Automated Commit] Format Codebase (github-actions[bot], Dec 1, 2025)
9c7b793  Enabling mlperf-inf-mm-vl2l benchmark vllm. (wangshangsam, Dec 2, 2025)
443ff3d  Merge branch 'jcalderon/vlm-accuracy-eval' of github.com:CentML/mlper… (wangshangsam, Dec 2, 2025)
36ab421  [Automated Commit] Format Codebase (github-actions[bot], Dec 2, 2025)
ea1e465  Commit to trigger the GitHub Actions in inference PR (anandhu-eng, Dec 2, 2025)
93a1a3e  Merge pull request #6 from anandhu-eng/patch-39 (wangshangsam, Dec 2, 2025)
a1e6d76  empty commit (wangshangsam, Dec 2, 2025)
3 changes: 3 additions & 0 deletions multimodal/vl2l/pyproject.toml
@@ -19,6 +19,9 @@ dependencies = [
"pydantic-typer @ git+https://github.com/CentML/pydantic-typer.git@wangshangsam/preserve-full-annotated-type",
"pympler",
"typer",
"scikit-learn",
"tabulate",
"hiclass",
]
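# The three dependencies added above support the new accuracy evaluation:
# scikit-learn supplies f1_score for the is_secondhand field, hiclass supplies
# the hierarchical F1 metric used for the category field, and tabulate renders
# the results table.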
dynamic = ["version"]

38 changes: 34 additions & 4 deletions multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py
@@ -10,10 +10,11 @@

import mlperf_loadgen as lg
from loguru import logger
from pydantic import BaseModel, DirectoryPath, Field, field_validator
from pydantic import BaseModel, DirectoryPath, Field, FilePath, field_validator
from pydantic_typer import Typer
from typer import Option

from .evaluation import run_evaluation
from .task import ShopifyGlobalCatalogue

app = Typer()
@@ -178,7 +179,9 @@ class TestSettings(BaseModel):
int,
Field(
description="""The minimum testing query count.
The benchmark runs until this value has been met.""",
The benchmark runs until this value has been met.
If min_query_count is less than the total number of samples in the dataset,
only the first min_query_count samples will be used during testing.""",
),
] = 100

@@ -374,6 +377,18 @@ class Dataset(BaseModel):
),
] = None

split: Annotated[
list[str],
Field(
description=(
"""Dataset splits to use for the benchmark. Eg: train.
You can add multiple splits by calling the same argument
multiple times. Eg:
--dataset.split test --dataset.split train"""
),
),
] = ["train", "test"]


class Verbosity(StrEnum):
"""The verbosity level of the logger."""
@@ -407,7 +422,22 @@ class Endpoint(BaseModel):


@app.command()
def main(
def evaluate(
filename: Annotated[
FilePath,
Option(
help="Location of the accuracy file.",
),
],
dataset: Dataset,
) -> None:
"""Evaluate the accuracy of the VLM responses."""
logger.info("Evaluating the accuracy file")
run_evaluation(filename=filename, dataset=dataset)
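# For illustration only: with pydantic-typer expanding the Dataset model into
# dotted options, this new subcommand would be invoked roughly as
#   evaluate --filename <path to the accuracy JSON> --dataset.split test
# (the installed entry-point name is not shown in this diff), alongside the
# `benchmark` subcommand defined below.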


@app.command()
def benchmark(
*,
settings: Settings,
model: Model,
@@ -437,7 +467,7 @@ def main(
dataset_cli=dataset,
model_cli=model,
endpoint_cli=endpoint,
scenario=settings.test.scenario,
settings=settings.test,
random_seed=random_seed,
)
sut = task.construct_sut()
208 changes: 208 additions & 0 deletions multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py
@@ -0,0 +1,208 @@
"""Task definitions for the VL2L benchmark."""

from __future__ import annotations

import json
from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np
from datasets import load_dataset
from hiclass.metrics import f1
from loguru import logger
from sklearn.metrics import f1_score
from tabulate import tabulate

if TYPE_CHECKING:
from pydantic import FilePath

from .cli import Dataset as DatasetCLI


def get_hierarchical_components(predicted_path: str,
true_path: str,
separator: str = " > ") -> tuple[int, int, int]:
"""Calculates the components for Hierarchical Precision.

Args:
predicted_path: Categories predicted by the VLM.
true_path: Ground truth categories.
separator: String used to separate each category.

Returns:
Tuple of the longest-common-prefix length (the intersection),
the number of predicted categories, and
the number of ground truth categories.
"""
# 1. Split the paths into categories (nodes)
predicted_categories = [c.strip() for c in predicted_path.split(separator)]
true_categories = [c.strip() for c in true_path.split(separator)]

# Check for empty paths
if not predicted_categories or not true_categories:
return 0, len(predicted_categories), len(true_categories)

# 2. Count the intersection (longest common prefix)
intersection_count = 0

# Iterate through the paths simultaneously
for pred_cat, true_cat in zip(predicted_categories,
true_categories,
strict=False):
if pred_cat == true_cat:
intersection_count += 1
else:
# Stop as soon as a mismatch is found (enforces hierarchical match)
break

pred_length = len(predicted_categories)
true_length = len(true_categories)

return intersection_count, pred_length, true_length
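# Worked example for the helper above (category strings are made up for
# illustration): with the default " > " separator, predicted
# "Home > Kitchen > Mugs" vs. ground truth "Home > Kitchen > Plates" share the
# prefix ["Home", "Kitchen"], so the function returns (2, 3, 3).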


def calculate_hierarchical_f1(data: list[tuple[str, str]]) -> float:
"""Calculates the aggregate hF scores for a list of samples.

Args:
data: A list of tuples, where each tuple is
(predicted_path_str, true_path_str).

Returns:
F1 score
"""
total_intersection = 0
total_predicted_length = 0
total_true_length = 0

# 1. Aggregate the components across all samples
for pred_path, true_path in data:
intersection, pred_len, true_len = \
get_hierarchical_components(pred_path, true_path)

total_intersection += intersection
total_predicted_length += pred_len
total_true_length += true_len

# 2. Calculate hP and hR
hp = total_intersection / total_predicted_length \
if total_predicted_length > 0 else 0.0
hr = total_intersection / total_true_length \
if total_true_length > 0 else 0.0

return 0.0 if hp + hr == 0 else 2 * (hp * hr) / (hp + hr)
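# Put differently, the aggregate score above is the micro-averaged hierarchical
# F1: hP = sum(intersections) / sum(predicted path lengths),
# hR = sum(intersections) / sum(true path lengths), and
# hF = 2 * hP * hR / (hP + hR). For the single (2, 3, 3) example above,
# hP = hR = 2/3 and hF = 2/3.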


def calculate_exact_match(generated_text: str, original_text: str) -> float:
"""Calculates binary Exact Match (EM) score.

We clean the text (lowercase, strip whitespace) for a fairer comparison.

Args:
generated_text: Output from the VLM.
original_text: Ground truth information from the dataset.

Returns:
1 if the values match or 0 otherwise
"""
gen = generated_text.strip().lower()
orig = original_text.strip().lower()

return 1.0 if gen == orig else 0.0


def calculate_secondhand_f1(data: list[tuple[str, str]]) -> float:
"""Calculate F1 score of is_secondhand field.

Args:
data: List of tuples of predicted and true values
Returns:
f1 score
"""
y_pred = []
y_src = []
for pred, src in data:
y_pred.append(pred)
y_src.append(src)

return f1_score(y_src, y_pred)


def calculate_hiclass_f1(data: list[tuple[str, str]]) -> float:
"""Alt method to calculate hierarchical F1.

Args:
data: List of tuples of predicted and true values
Returns:
f1 score
"""
y_pred_raw = []
y_true_raw = []

for pred, src in data:
path1 = pred.split(" > ")
path2 = src.split(" > ")

y_pred_raw.append(path1)
y_true_raw.append(path2)

# 2. Find the global maximum length across ALL samples
# We check the longest path in both true and pred lists
max_len = max(len(p) for p in y_true_raw + y_pred_raw)

# 3. Pad all lists to the global max_len
for i in range(len(y_true_raw)):
# Pad Truth
pad_len_true = max_len - len(y_true_raw[i])
y_true_raw[i] += [""] * pad_len_true

# Pad Prediction
pad_len_pred = max_len - len(y_pred_raw[i])
y_pred_raw[i] += [""] * pad_len_pred

# 4. Convert to numpy arrays
y_true = np.array(y_true_raw)
y_pred = np.array(y_pred_raw)

# 5. Calculate Score
return f1(y_true, y_pred)
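# Design note: converting ragged paths straight to numpy would not yield the
# 2-D array that hiclass's f1 metric operates on, which is why shorter paths
# are right-padded with empty strings up to the global maximum depth first.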


def run_evaluation(filename: FilePath, dataset: DatasetCLI) -> None:
"""Main function to run the evaluation."""
with Path.open(filename) as f:
model_output = json.load(f)

original_data = load_dataset(
dataset.repo_id,
token=dataset.token,
split="+".join(dataset.split),
)

category_dataset_pred_src = []
is_secondhand_pred_src = []
for elem in model_output:
byte_data = bytes.fromhex(elem["data"])
idx = elem["qsl_idx"]
pred_text_decode = byte_data.decode("utf-8")
pred_item = json.loads(pred_text_decode)
ground_truth_item = original_data[idx]
category_dataset_pred_src.append((pred_item["category"],
ground_truth_item["ground_truth_category"]))
is_secondhand_pred_src.append((int(pred_item["is_secondhand"]),
int(ground_truth_item["ground_truth_is_secondhand"])))

category_f1_score = calculate_hierarchical_f1(
category_dataset_pred_src)
hiclass_f1 = calculate_hiclass_f1(category_dataset_pred_src)
is_secondhand_f1_score = calculate_secondhand_f1(is_secondhand_pred_src)

data = [
["category", category_f1_score, hiclass_f1],
["is_secondhand", is_secondhand_f1_score],
]

logger.info("Results:\n{}", tabulate(data,
headers=["Fields", "F1 Score",
"HiClass F1 Score"],
tablefmt="fancy_grid"))
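
For reference, a minimal sketch of the record shape run_evaluation() consumes, assuming the standard LoadGen accuracy-log layout of qsl_idx plus hex-encoded response bytes; the category and is_secondhand values below are placeholders:

import json

# Hypothetical VLM response for one sample (placeholder values).
response = {"category": "Home > Kitchen > Mugs", "is_secondhand": False}

# One accuracy-log entry: the response is serialized to JSON, encoded as UTF-8,
# and hex-encoded into the "data" field; "qsl_idx" points at the dataset row.
record = {
    "qsl_idx": 0,
    "data": json.dumps(response).encode("utf-8").hex(),
}

# run_evaluation() reverses this with bytes.fromhex(...).decode("utf-8") and
# json.loads(...), then compares the fields against ground_truth_category and
# ground_truth_is_secondhand in the dataset.
print(json.dumps([record], indent=2))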