Merged
Commits
35 commits
35c9704
Initial proposal for VLM - evaluation
johncalesp Nov 19, 2025
f5995c3
address review comments and test hiclass implementation
johncalesp Nov 20, 2025
46e4fc0
[Automated Commit] Format Codebase
github-actions[bot] Nov 20, 2025
0dc13dc
additional fixes to reviews
johncalesp Nov 23, 2025
4aa5e0d
[Automated Commit] Format Codebase
github-actions[bot] Nov 23, 2025
5e1590b
address PR comments
johncalesp Nov 24, 2025
e1ccc85
[Automated Commit] Format Codebase
github-actions[bot] Nov 24, 2025
7e0c444
add a more detail description of the field dataset.split
johncalesp Nov 24, 2025
b35b057
Enable exception logging in _query_endpoint_async
wangshangsam Nov 25, 2025
48b5bdb
[Automated Commit] Format Codebase
github-actions[bot] Nov 25, 2025
0e4c5ee
Merge branch 'master' into jcalderon/vlm-accuracy-eval
wangshangsam Nov 25, 2025
f8e1498
[Automated Commit] Format Codebase
github-actions[bot] Nov 25, 2025
f464499
Trigger CI/CD pipeline
johncalesp Nov 25, 2025
9609cd0
Merge branch 'master' into jcalderon/vlm-accuracy-eval
wangshangsam Nov 26, 2025
bc56ec9
Add performance_sample_count_override as a CLI flag.
wangshangsam Nov 26, 2025
b8e2909
Merge branch 'jcalderon/vlm-accuracy-eval' of github.com:CentML/mlper…
wangshangsam Nov 26, 2025
8b43239
[Automated Commit] Format Codebase
github-actions[bot] Nov 26, 2025
9466529
Merge branch 'master' into jcalderon/vlm-accuracy-eval
wangshangsam Nov 26, 2025
dae5065
add json format to queries
johncalesp Nov 26, 2025
c840dd6
[Automated Commit] Format Codebase
github-actions[bot] Nov 26, 2025
0b45001
added schema file and made necessary changes
johncalesp Nov 26, 2025
5f1d02c
[Automated Commit] Format Codebase
github-actions[bot] Nov 26, 2025
1849d6c
refactoring and linting
wangshangsam Nov 27, 2025
eef83eb
[Automated Commit] Format Codebase
github-actions[bot] Nov 27, 2025
dafa7f1
Add Dockerfile
wangshangsam Nov 28, 2025
ee91e7f
Add use_guided_decoding to let user choose to use guided_decoding or …
wangshangsam Nov 29, 2025
b9dd5ad
[Automated Commit] Format Codebase
github-actions[bot] Nov 29, 2025
ace336e
add f1 scores of uniform random selection
johncalesp Dec 1, 2025
60f72be
[Automated Commit] Format Codebase
github-actions[bot] Dec 1, 2025
9c7b793
Enabling mlperf-inf-mm-vl2l benchmark vllm.
wangshangsam Dec 2, 2025
443ff3d
Merge branch 'jcalderon/vlm-accuracy-eval' of github.com:CentML/mlper…
wangshangsam Dec 2, 2025
36ab421
[Automated Commit] Format Codebase
github-actions[bot] Dec 2, 2025
ea1e465
Commit to trigger the GitHub Actions in inference PR
anandhu-eng Dec 2, 2025
93a1a3e
Merge pull request #6 from anandhu-eng/patch-39
wangshangsam Dec 2, 2025
a1e6d76
empty commit
wangshangsam Dec 2, 2025
3 changes: 3 additions & 0 deletions multimodal/vl2l/pyproject.toml
@@ -19,6 +19,9 @@ dependencies = [
"pydantic-typer @ git+https://github.com/CentML/pydantic-typer.git@wangshangsam/preserve-full-annotated-type",
"pympler",
"typer",
"scikit-learn",
"tabulate",
"hiclass",
]
dynamic = ["version"]

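
The three new dependencies back the accuracy-evaluation path introduced in this PR: hiclass for hierarchical metrics, tabulate for report output, and scikit-learn for flat baseline scores. Below is a minimal sketch of how they might fit together; the category paths and report layout are illustrative assumptions, not the PR's actual evaluation code.

# Hedged sketch: hierarchical F1 over product-category paths with hiclass,
# rendered as a small report with tabulate. The labels here are made up.
import numpy as np
from hiclass.metrics import f1
from tabulate import tabulate

# Each row is a root-to-leaf path in the category hierarchy.
y_true = np.array([
    ["Apparel", "Shoes", "Sneakers"],
    ["Electronics", "Audio", "Headphones"],
])
y_pred = np.array([
    ["Apparel", "Shoes", "Sneakers"],
    ["Electronics", "Audio", "Speakers"],
])

rows = [["hierarchical_f1", f1(y_true, y_pred)]]
print(tabulate(rows, headers=["metric", "score"]))
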
128 changes: 96 additions & 32 deletions multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py
@@ -10,11 +10,19 @@

import mlperf_loadgen as lg
from loguru import logger
from pydantic import BaseModel, DirectoryPath, Field, field_validator
from pydantic import (
BaseModel,
DirectoryPath,
Field,
FilePath,
NonNegativeInt,
field_validator,
)
from pydantic_typer import Typer
from typer import Option

from .task import ShopifyGlobalCatalogue
from .evaluation import run_evaluation
from .task import ShopifyGlobalCatalogue, Task

app = Typer()

@@ -144,62 +152,84 @@ class TestSettings(BaseModel):

server_target_latency: Annotated[
timedelta,
Field(description="""Expected latency constraint for Server scenario.
This is a constraint that we expect depending
on the argument server_expected_qps.
When server_expected_qps increases, we expect the latency to also increase.
When server_expected_qps decreases, we expect the latency to also decrease."""),
Field(
description="Expected latency constraint for Server scenario. "
"This is a constraint that we expect depending on the argument "
"server_expected_qps. When server_expected_qps increases, we expect the "
"latency to also increase. When server_expected_qps decreases, we expect "
"the latency to also decrease.",
),
] = timedelta(seconds=1)

server_ttft_latency: Annotated[
timedelta,
Field(description="""Time to First Token (TTFT)
latency constraint result validation"
(used when use_token_latencies is enabled)."""),
Field(
description="Time to First Token (TTFT) latency constraint result "
"validation (used when use_token_latencies is enabled).",
),
] = timedelta(seconds=1)

server_tpot_latency: Annotated[
timedelta,
Field(description="""Time per Output Token (TPOT)
latency constraint result validation"
(used when use_token_latencies is enabled)."""),
Field(
description="Time per Output Token (TPOT) latency constraint result "
"validation (used when use_token_latencies is enabled).",
),
] = timedelta(seconds=1)

min_duration: Annotated[
timedelta,
Field(
description="""The minimum testing duration
(in seconds or ISO 8601 format like PT5S).
The benchmark runs until this value has been met.""",
description="The minimum testing duration (in seconds or ISO 8601 format "
"like PT5S). The benchmark runs until this value has been met.",
),
] = timedelta(seconds=5)

min_query_count: Annotated[
int,
Field(
description="""The minimum testing query count.
The benchmark runs until this value has been met.""",
description="The minimum testing query count. The benchmark runs until this"
" value has been met. If min_query_count is less than the total number of "
"samples in the dataset, only the first min_query_count samples will be "
"used during testing.",
),
] = 100

performance_sample_count_override: Annotated[
NonNegativeInt,
Field(
description="The number of samples to use for the performance test. In the " # noqa: S608
"performance mode, the benchmark will select P random samples from the "
"dataset, then send enough queries using these P samples (and repeating "
"them if necessary) to reach the min_duration and min_query_count. If a "
"non-zero value is passed to this flag, the P will be this value. "
"Otherwise, the benchmark will estimate how many samples can be loaded into"
f" {Task.ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES} bytes of memory "
"based on the memory footprint of randomly selected "
f"{Task.MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES} samples (at most), and then"
" use this estimation as the value P.",
),
] = 0

use_token_latencies: Annotated[
bool,
Field(
description="""By default,
the Server scenario will use server_target_latency as the constraint.
When set to True, the Server scenario will use server_ttft_latency
and server_tpot_latency as the constraint.""",
description="By default, the Server scenario will use server_target_latency"
" as the constraint. When set to True, the Server scenario will use "
"server_ttft_latency and server_tpot_latency as the constraint.",
),
] = False

@field_validator("server_target_latency",
"server_ttft_latency",
"server_tpot_latency",
"min_duration",
mode="before")
@field_validator(
"server_target_latency",
"server_ttft_latency",
"server_tpot_latency",
"min_duration",
mode="before",
)
@classmethod
def parse_timedelta(cls, value: timedelta |
float | str) -> timedelta | str:
def parse_timedelta(cls, value: timedelta | float |
str) -> timedelta | str:
"""Parse timedelta from seconds (int/float/str) or ISO 8601 format."""
if isinstance(value, timedelta):
return value
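
The performance_sample_count_override description above refers to an estimation step when the flag stays at 0. The following is a hedged sketch of what that estimation could look like, using pympler (already a dependency) to measure footprints; the constant values, helper name, and dataset access pattern are assumptions, and only the two Task constant names come from the diff.

# Hedged sketch: estimate how many performance samples P fit in the allowed
# memory budget, roughly as described for performance_sample_count_override.
import random
from pympler import asizeof

ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES = 8 * 1024**3  # assumed byte budget
MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES = 64  # assumed probe size


def estimate_performance_sample_count(samples: list, override: int = 0) -> int:
    """Return P, the number of samples to preload for the performance test."""
    if override > 0:
        # A non-zero CLI value wins outright.
        return override
    # Measure a small random probe and extrapolate to the memory budget.
    probe = random.sample(
        samples, min(len(samples), MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES)
    )
    bytes_per_sample = asizeof.asizeof(probe) / max(len(probe), 1)
    estimate = int(
        ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES / max(bytes_per_sample, 1)
    )
    return min(max(estimate, 1), len(samples))
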
@@ -223,20 +253,25 @@ def to_lgtype(self) -> lg.TestSettings:
settings.offline_expected_qps = self.offline_expected_qps
settings.server_target_qps = self.server_expected_qps
settings.server_target_latency_ns = round(
self.server_target_latency.total_seconds() * 1e9)
self.server_target_latency.total_seconds() * 1e9,
)
settings.ttft_latency = round(
self.server_ttft_latency.total_seconds() * 1e9)
settings.tpot_latency = round(
self.server_tpot_latency.total_seconds() * 1e9)
settings.min_duration_ms = round(
self.min_duration.total_seconds() * 1000)
settings.min_query_count = self.min_query_count
settings.performance_sample_count_override = (
self.performance_sample_count_override
)
settings.use_token_latencies = self.use_token_latencies
return settings


class LogOutputSettings(BaseModel):
"""The test log output settings for the MLPerf inference LoadGen."""

outdir: Annotated[
DirectoryPath,
Field(
@@ -296,6 +331,7 @@ def to_lgtype(self) -> lg.LogOutputSettings:

class LogSettings(BaseModel):
"""The test log settings for the MLPerf inference LoadGen."""

log_output: Annotated[
LogOutputSettings,
Field(
@@ -327,6 +363,7 @@ def to_lgtype(self) -> lg.LogSettings:

class Settings(BaseModel):
"""Combine the settings for the test and logging of LoadGen."""

test: Annotated[
TestSettings,
Field(
@@ -374,6 +411,18 @@ class Dataset(BaseModel):
),
] = None

split: Annotated[
list[str],
Field(
description=(
"""Dataset splits to use for the benchmark. Eg: train.
You can add multiple splits by calling the same argument
multiple times. Eg:
--dataset.split test --dataset.split train"""
),
),
] = ["train", "test"]


class Verbosity(StrEnum):
"""The verbosity level of the logger."""
@@ -407,7 +456,22 @@ class Endpoint(BaseModel):


@app.command()
def main(
def evaluate(
filename: Annotated[
FilePath,
Option(
help="Location of the accuracy file.",
),
],
dataset: Dataset,
) -> None:
"""Evaluate the accuracy of the VLM responses."""
logger.info("Evaluating the accuracy file")
run_evaluation(filename=filename, dataset=dataset)


@app.command()
def benchmark(
*,
settings: Settings,
model: Model,
@@ -437,7 +501,7 @@ def main(
dataset_cli=dataset,
model_cli=model,
endpoint_cli=endpoint,
scenario=settings.test.scenario,
settings=settings.test,
random_seed=random_seed,
)
sut = task.construct_sut()
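
With main split into evaluate and benchmark subcommands, here is a hedged usage sketch via Typer's test runner. The dotted --dataset.split option follows the convention quoted in the field description above, while the --filename spelling, the accuracy file name, and the assumption that the remaining Dataset fields have defaults are all illustrative.

# Hedged sketch: drive the new evaluate subcommand programmatically.
from typer.testing import CliRunner

from mlperf_inference_multimodal_vl2l.cli import app

runner = CliRunner()

# Score a previously produced accuracy log against the chosen dataset splits.
result = runner.invoke(
    app,
    [
        "evaluate",
        "--filename", "output/mlperf_log_accuracy.json",
        "--dataset.split", "train",
        "--dataset.split", "test",
    ],
)
print(result.output)
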