Commit f690570

[VLM] Add a CLI plugin system for mlperf-inf-mm-q3vl benchmark (mlcommons#2420)
* Introduce the mlperf-inf-mm-q3vl benchmark plugin system
* fix circular import
* [Automated Commit] Format Codebase
* update README

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 27db053 commit f690570

File tree

5 files changed: +302 −51 lines changed


multimodal/qwen3-vl/README.md

Lines changed: 195 additions & 0 deletions
@@ -268,6 +268,201 @@ bash submit.sh --help
- Testing duration $\ge$ 10 mins.
- Sample concatenation permutation is enabled.

## Plugin System for `mlperf-inf-mm-q3vl benchmark`

The `mlperf-inf-mm-q3vl` package supports a plugin system that allows third-party
packages to register additional subcommands under `mlperf-inf-mm-q3vl benchmark`. This
uses Python's standard entry points mechanism.

The purpose of this feature is to let benchmark result submitters customize and fit
`mlperf-inf-mm-q3vl` to the inference system they want to benchmark, **without**
directly modifying the source code of `mlperf-inf-mm-q3vl`, which is frozen once the
benchmark has been finalized.

### How it works

1. **Plugin Discovery**: When the CLI starts, it automatically discovers all registered
   plugins via the `mlperf_inf_mm_q3vl.benchmark_plugins` entry point group.
2. **Plugin Loading**: Each plugin's entry point function is called to retrieve either a
   single command or a Typer app.
3. **Command Registration**: The plugin's commands are automatically added to the
   `benchmark` subcommand group, as sketched below.
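
This flow can be pictured with a short sketch. It is illustrative only, using a plain
`typer.Typer` app as a stand-in for the `benchmark` subcommand group (the real loader
lives inside `mlperf-inf-mm-q3vl` and builds on `pydantic_typer`):

```python
from importlib.metadata import entry_points

from typer import Typer

benchmark_app = Typer()  # stand-in for the real `benchmark` subcommand group


def load_benchmark_plugins(app: Typer) -> None:
    """Discover registered plugins and mount their commands on the app."""
    for ep in entry_points(group="mlperf_inf_mm_q3vl.benchmark_plugins"):
        registered = ep.load()()  # import the entry point, then call it
        if isinstance(registered, tuple):
            sub_app, name = registered  # a Typer app with nested subcommands
            app.add_typer(sub_app, name=name)
        else:
            # A single command: the entry point name becomes the subcommand name.
            app.command(name=ep.name)(registered)


load_benchmark_plugins(benchmark_app)
```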

### Example: creating a `mlperf-inf-mm-q3vl-foo` plugin package for `mlperf-inf-mm-q3vl benchmark foo`

#### Step 1: Package Structure

Create a new Python package with the following structure:

```
mlperf-inf-mm-q3vl-foo/
├── pyproject.toml
└── src/
    └── mlperf_inf_mm_q3vl_foo/
        ├── __init__.py
        ├── schema.py
        ├── deploy.py
        └── plugin.py
```

Note that this is only a minimal, illustrative example. Users are free to structure and
name their Python packages and modules however they wish. A hypothetical `schema.py`
might simply extend the core endpoint schema, as sketched below.
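
For instance, `schema.py` could hold the `FooEndpoint` model used in the next step.
This is a hypothetical sketch assuming the core `Endpoint` schema is a pydantic model
that can be subclassed; the extra field is invented purely for illustration:

```python
"""Hypothetical schema.py for the Foo plugin."""

from mlperf_inf_mm_q3vl.schema import Endpoint


class FooEndpoint(Endpoint):
    """Endpoint spec for the Foo inference system."""

    # Invented example field: how many Foo worker replicas to deploy.
    num_foo_workers: int = 1
```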

#### Step 2: Implement the `mlperf-inf-mm-q3vl-foo` plugin

Create your plugin entry point function in `plugin.py`:

```python
"""Plugin to support benchmarking the Foo inference system."""

from collections.abc import Callable
from typing import Annotated

from loguru import logger
from typer import Option

# run_benchmark is the core entry point added in this commit; adjust the module
# path if your installed version exposes it elsewhere.
from mlperf_inf_mm_q3vl.benchmark import run_benchmark
from mlperf_inf_mm_q3vl.log import setup_loguru_for_benchmark
from mlperf_inf_mm_q3vl.schema import Dataset, Settings, Verbosity

from .schema import FooEndpoint


def register_foo_benchmark() -> Callable:
    """Entry point for the plugin to benchmark the Foo inference system.

    This function is called when the CLI discovers the plugin.
    It should return either:

    - a single command function (decorated with appropriate options), or
    - a tuple of (Typer app, command name) for more complex hierarchies.
    """

    def benchmark_foo(
        *,
        settings: Settings,
        dataset: Dataset,
        # Add your foo-specific parameters here.
        foo: FooEndpoint,
        custom_param: Annotated[
            int,
            Option(help="Custom parameter for the Foo backend."),
        ] = 2,
        random_seed: Annotated[
            int,
            Option(help="The seed for the random number generator."),
        ] = 12345,
        verbosity: Annotated[
            Verbosity,
            Option(help="The verbosity level of the logger."),
        ] = Verbosity.INFO,
    ) -> None:
        """Deploy and benchmark using the Foo backend.

        This command deploys a model using the Foo backend
        and runs the MLPerf benchmark against it.
        """
        from .deploy import FooDeployer

        setup_loguru_for_benchmark(settings=settings, verbosity=verbosity)
        logger.info(
            "Starting to benchmark the Foo inference system "
            "with endpoint spec {} and custom param {}",
            foo,
            custom_param,
        )
        # Your implementation here.
        with FooDeployer(endpoint=foo, settings=settings, custom_param=custom_param):
            # FooDeployer makes sure that Foo is deployed and currently healthy.
            # Run the benchmark using the core run_benchmark function.
            run_benchmark(
                settings=settings,
                dataset=dataset,
                endpoint=foo,
                random_seed=random_seed,
            )

    # Return the command function.
    # The entry point name will be used as the subcommand name.
    return benchmark_foo
```
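
The command above relies on `deploy.py` providing `FooDeployer`, a context manager that
brings Foo up before the benchmark and tears it down afterwards. A hypothetical minimal
sketch (none of these names or behaviors are part of `mlperf-inf-mm-q3vl` itself):

```python
"""Hypothetical deploy.py for the Foo plugin."""

from types import TracebackType

from mlperf_inf_mm_q3vl.schema import Settings

from .schema import FooEndpoint


class FooDeployer:
    """Deploys Foo on entry and shuts it down on exit."""

    def __init__(
        self, *, endpoint: FooEndpoint, settings: Settings, custom_param: int
    ) -> None:
        self.endpoint = endpoint
        self.settings = settings
        self.custom_param = custom_param

    def __enter__(self) -> "FooDeployer":
        ...  # Launch the Foo server and block until its health check passes.
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        ...  # Shut the Foo server down, even if the benchmark raised.
```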

#### Step 3: Configure `pyproject.toml`

Register the plugin in its package's `pyproject.toml`:

```toml
[project]
name = "mlperf-inf-mm-q3vl-foo"
version = "0.1.0"
description = "Enable mlperf-inf-mm-q3vl to benchmark the Foo inference system."
requires-python = ">=3.12"
dependencies = [
    "mlperf-inf-mm-q3vl @ git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl/",
    # Add your backend-specific dependencies here.
]

[project.entry-points."mlperf_inf_mm_q3vl.benchmark_plugins"]
# The key here becomes the subcommand name.
foo = "mlperf_inf_mm_q3vl_foo.plugin:register_foo_benchmark"

[build-system]
requires = ["setuptools>=80"]
build-backend = "setuptools.build_meta"
```
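
After installation, you can confirm the entry point is visible to Python's discovery
machinery with nothing but the standard library:

```python
from importlib.metadata import entry_points

# Lists every registered benchmark plugin; expect "foo" once the package is installed.
for ep in entry_points(group="mlperf_inf_mm_q3vl.benchmark_plugins"):
    print(ep.name, "->", ep.value)
```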

#### Step 4: Install and use `mlperf-inf-mm-q3vl benchmark foo`

```bash
# Install your plugin package
pip install mlperf-inf-mm-q3vl-foo

# The new subcommand is now available
mlperf-inf-mm-q3vl benchmark foo --help
mlperf-inf-mm-q3vl benchmark foo \
    --settings-file settings.toml \
    --dataset shopify-global-catalogue \
    --custom-param 3
```

#### Advanced: Nested Subcommands

If you want to create multiple subcommands under a single plugin (e.g.,
`mlperf-inf-mm-q3vl benchmark foo standard` and
`mlperf-inf-mm-q3vl benchmark foo optimized`), return a tuple of `(Typer app, name)`:

```python
from pydantic_typer import Typer


def register_foo_benchmark() -> tuple[Typer, str]:
    """Entry point that creates nested subcommands."""
    # Create a Typer app for your plugin
    foo_app = Typer(help="Benchmarking options for the Foo inference systems.")

    @foo_app.command(name="standard")
    def foo_standard() -> None:  # add your parameters here
        """Run the standard Foo benchmark."""
        ...  # Implementation

    @foo_app.command(name="optimized")
    def foo_optimized() -> None:  # add your parameters here
        """Run the optimized Foo benchmark with maximum performance."""
        ...  # Implementation

    # Return a tuple of (app, command name)
    return (foo_app, "foo")
```

This will create:

- `mlperf-inf-mm-q3vl benchmark foo standard`
- `mlperf-inf-mm-q3vl benchmark foo optimized`

### Best Practices

1. Dependencies: Declare `mlperf-inf-mm-q3vl` as a dependency in your plugin package.
2. Documentation: Provide clear docstrings for your plugin commands; they appear in
   `--help` output.
3. Schema Reuse: Reuse the core `Settings`, `Dataset`, and other schemas from
   `mlperf_inf_mm_q3vl.schema` for consistency and to minimize boilerplate code.
4. Lazy Imports: If your plugin has heavy dependencies, import them inside functions
   rather than at module level to avoid slowing down CLI startup.

## Developer Guide

multimodal/qwen3-vl/scripts/slurm/submit.sh

Lines changed: 2 additions & 2 deletions
@@ -99,12 +99,12 @@ while [[ $# -gt 0 ]]; do
         shift
         ;;
     -seq | --server-expected-qps)
-        server_expected_qps=$2
+        server_target_qps=$2
         shift
         shift
         ;;
     -seq=* | --server-expected-qps=*)
-        server_expected_qps=${1#*=}
+        server_target_qps=${1#*=}
         shift
         ;;
     -tps | --tensor-parallel-size)
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@

"""Core benchmark execution logic for the Qwen3-VL (Q3VL) benchmark."""

from __future__ import annotations

import mlperf_loadgen as lg
from loguru import logger

from .schema import Dataset, Endpoint, Settings
from .task import ShopifyGlobalCatalogue


def run_benchmark(
    settings: Settings,
    dataset: Dataset,
    endpoint: Endpoint,
    random_seed: int,
) -> None:
    """Run the Qwen3-VL (Q3VL) benchmark."""
    logger.info("Running Qwen3-VL (Q3VL) benchmark with settings: {}", settings)
    logger.info("Running Qwen3-VL (Q3VL) benchmark with dataset: {}", dataset)
    logger.info("Running Qwen3-VL (Q3VL) benchmark with OpenAI API endpoint: {}", endpoint)
    logger.info("Running Qwen3-VL (Q3VL) benchmark with random seed: {}", random_seed)
    test_settings, log_settings = settings.to_lgtype()
    task = ShopifyGlobalCatalogue(
        dataset=dataset,
        endpoint=endpoint,
        settings=settings.test,
        random_seed=random_seed,
    )
    sut = task.construct_sut()
    qsl = task.construct_qsl()
    logger.info("Starting the Qwen3-VL (Q3VL) benchmark with LoadGen...")
    lg.StartTestWithLogSettings(sut, qsl, test_settings, log_settings)
    logger.info("The Qwen3-VL (Q3VL) benchmark with LoadGen completed.")
    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)
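
For readers unfamiliar with LoadGen's Python bindings: `construct_sut` and
`construct_qsl` presumably wrap LoadGen's standard constructors. A minimal sketch of
that pattern with invented no-op callbacks (the real `ShopifyGlobalCatalogue` task
supplies callbacks that talk to the endpoint and the dataset):

```python
import mlperf_loadgen as lg


def issue_queries(query_samples) -> None:
    # Send each sample to the system under test; here we complete them immediately.
    lg.QuerySamplesComplete(
        [lg.QuerySampleResponse(sample.id, 0, 0) for sample in query_samples]
    )


def flush_queries() -> None:
    # Called by LoadGen when outstanding queries should be flushed.
    pass


sut = lg.ConstructSUT(issue_queries, flush_queries)
qsl = lg.ConstructQSL(
    1024,  # total sample count (invented)
    1024,  # performance sample count (invented)
    lambda samples: None,  # load samples into memory
    lambda samples: None,  # unload samples
)
```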
