
Commit 0a20748

Authored by Victor49152, nvzhihanj, and github-code-quality[bot]
feat: Add shopify dataset for q3vl inference (#152)
* Initial commit for adding shopify dataset to predefined
* Temporal test script
* Tested shopify dataset
* Pre-commit formatting
* Remove unused counter and modified default image format
* Apply the suggestion to use df.to_dict instead of iterrows
* Unused import
* Update folder and class names
* Offline perf yaml verified
* Rename
* Accuracy results tested
* Add the updated Readme
* Add unit tests for new preset dataset and scorer
* Potential fix for pull request finding 'Unused global variable'
* Config and readme related updates
* Refactor metadata related schema
* load_from_huggingface returns an HF dataset instead of a pandas frame
* Follow-up fixes for updating load_from_huggingface
* Add Pillow to dependencies for dataset decoding
* Add logging for worker settings and make the worker init timeout configurable
* load_from_disk directly loads what's inside; no split kwarg supported
* No default cache dir needed
* Make zmq_recv/send buffer size configurable
* Finalize the yaml file
* Add new yaml config args to unit test
* Fix typing
* Put metadata in a separate file for better formatting
* Format fix; ruff fix
* Remove redundant type checks as HF dataset sample is fetched
* Update pytest as PIL image input is assumed
* Potential fix for pull request finding 'An assert statement has a side-effect'
* Revert "Adding Pillow to dependency for dataset decoding" (reverts commit 812fad1)
* Rename output to response to align with the function name
* Add example calculation in docstring and update the naming
* Revert the changes to load_from_huggingface; use HF's original load_dataset
* Remove uv file
* Fix Pillow version

Co-authored-by: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
1 parent 56971a5 commit 0a20748

File tree

19 files changed: +1322 additions, -14 deletions
Lines changed: 79 additions & 0 deletions

# Running Endpoints with Qwen3-VL-235B-A22B on Shopify Product Catalogue

This document describes how to perform MLPerf Q3VL benchmarking using the inference endpoints with the [Qwen3-VL-235B-A22B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct) model and [Shopify's Product Catalogue dataset](https://huggingface.co/datasets/Shopify/product-catalogue) for multimodal product taxonomy classification.

## Get Dataset

The Shopify Product Catalogue dataset is loaded from HuggingFace and is generated automatically on the first run. Images are converted to base64 for storage.

```
# Dataset is auto-downloaded from https://huggingface.co/datasets/Shopify/product-catalogue
# No manual download required - DataLoaderFactory handles it
```
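The base64-for-storage step is just an encode/decode round trip. A minimal sketch (the helper names and the stand-in bytes are illustrative; the real loading and decoding live in `DataLoaderFactory`, not in this snippet):

```python
import base64

def encode_image(image_bytes: bytes) -> str:
    """Encode raw image bytes as a base64 string for storage."""
    return base64.b64encode(image_bytes).decode("ascii")

def decode_image(b64: str) -> bytes:
    """Recover the raw bytes from the stored base64 string."""
    return base64.b64decode(b64)

# Stand-in for decoded image bytes (a fake PNG header plus padding).
fake_image = b"\x89PNG\r\n\x1a\n" + b"\x00" * 32
stored = encode_image(fake_image)
assert decode_image(stored) == fake_image
```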
## Get Model

Download the model checkpoint:

```bash
export MODEL_NAME=Qwen/Qwen3-VL-235B-A22B-Instruct
export HF_TOKEN=<your Hugging Face token>  # Optional for public model; may help with rate limits
hf download $MODEL_NAME
```
The model is available at [Qwen3-VL-235B-A22B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct); no access request is required.

**Note:** The Shopify Product Catalogue includes `ground_truth_category`, `ground_truth_brand`, and `ground_truth_is_secondhand` from the HuggingFace dataset. For accuracy evaluation, use the `shopify_category_f1` scorer, which computes hierarchical F1 over the category taxonomy (matching the [MLCommons Q3VL evaluation](https://github.com/mlcommons/inference/blob/master/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py)).

To add accuracy evaluation, include an accuracy dataset alongside the performance dataset:

```yaml
datasets:
  - name: shopify_product_catalogue::q3vl
    type: "performance"
    force: true
  - name: shopify_product_catalogue::q3vl
    type: "accuracy"
    force: true
    accuracy_config:
      eval_method: "shopify_category_f1"
      ground_truth: "ground_truth_category"
      extractor: "identity_extractor"  # Required by benchmark; scorer parses JSON internally
      num_repeats: 1
```
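The authoritative scoring code is the linked MLCommons `evaluation.py`. As a rough sketch of the usual definition of hierarchical F1 (precision and recall computed over a category path's set of ancestor prefixes), the metric behaves like this; the `hierarchical_f1` name and the `" > "` separator are assumptions for illustration, not the scorer's actual API:

```python
def hierarchical_f1(pred: str, truth: str, sep: str = " > ") -> float:
    """Hierarchical F1 over ancestor sets: each taxonomy path expands to the
    set of its prefixes, and F1 is computed on the overlap of those sets."""
    def ancestors(path: str) -> set:
        parts = [p.strip() for p in path.split(sep)]
        return {tuple(parts[: i + 1]) for i in range(len(parts))}

    p_set, t_set = ancestors(pred), ancestors(truth)
    overlap = len(p_set & t_set)
    if overlap == 0:
        return 0.0
    precision = overlap / len(p_set)
    recall = overlap / len(t_set)
    return 2 * precision * recall / (precision + recall)

# A prediction that stops one level short of the truth gets partial credit:
print(hierarchical_f1("Apparel > Shoes", "Apparel > Shoes > Sneakers"))  # 0.8
```

Here the prediction contributes 2 correct ancestors out of 2 predicted (precision 1.0) and 2 out of 3 true ancestors (recall 2/3), giving F1 = 0.8, so near-miss classifications score far better than under exact match.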
## Benchmark Qwen3-VL-235B-A22B using a config file

Prepare the environment:

```bash
export MODEL_NAME=Qwen/Qwen3-VL-235B-A22B-Instruct
export HF_TOKEN=<your Hugging Face token>  # Optional for public model
export HF_HOME=<path to HuggingFace cache, e.g. ~/.cache/huggingface>
```

Launch the vLLM server (the vision model requires appropriate GPU resources):

```bash
docker run --runtime nvidia --gpus all \
    -p 8000:8000 \
    --ipc=host \
    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
    --env "VLLM_HTTP_TIMEOUT_KEEP_ALIVE=3600" \
    --env "VLLM_ENGINE_READY_TIMEOUT_S=3600" \
    -v ${HF_HOME}:/root/.cache/huggingface \
    vllm/vllm-openai:latest \
    --model ${MODEL_NAME} \
    --tensor-parallel-size 4 \
    --max-model-len=32768 \
    --async-scheduling \
    --limit-mm-per-prompt.video 0
```
Run the benchmark:

```bash
inference-endpoint benchmark from-config -c examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml --timeout 600
```

This config uses `test_mode: "acc"` for an accuracy-only run (hierarchical F1); change it to `"both"` for performance plus accuracy, or `"perf"` for performance only.
Lines changed: 58 additions & 0 deletions

# Offline Benchmark - Qwen3-VL-235B-A22B on Shopify Product Catalogue
# MLPerf Inference Q3VL benchmark: multimodal product taxonomy classification
name: "offline-qwen3-vl-235b-a22b-shopify-benchmark"
version: "1.0"
type: "offline"
timeout: 14400  # Perf + acc run takes over 3 hours; consider limiting n_samples_to_issue for the perf run, or remove the accuracy dataset to skip the accuracy run

model_params:
  name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
  temperature: 0
  top_p: 1
  max_new_tokens: 150

datasets:
  - name: shopify_product_catalogue::q3vl
    type: "performance"
    force: true
  - name: shopify_product_catalogue::q3vl
    type: "accuracy"
    force: true
    accuracy_config:
      eval_method: "shopify_category_f1"
      ground_truth: "ground_truth_category"
      extractor: "identity_extractor"
      num_repeats: 1

settings:
  runtime:
    min_duration_ms: 600000  # 10 minutes
    n_samples_to_issue: 100  # Limit queries for testing (remove or increase for a full run)
    scheduler_random_seed: 42  # For Poisson/distribution sampling
    dataloader_random_seed: 42  # For dataset shuffling

  load_pattern:
    type: "max_throughput"

  client:
    workers: 2
    # ZMQ IPC buffers (bytes). Default 4MB; increase for large multimodal payloads (e.g. 16777216 = 16MB).
    zmq_recv_buffer_bytes: 16777216
    zmq_send_buffer_bytes: 16777216
    # Cap connections to avoid overwhelming the server. Default -1 uses ~25k (ephemeral ports),
    # causing warmup failures and connection timeouts. 256-512 is typical for vLLM.
    max_connections: 512
    # Increase the timeout for slow worker startup (spawn, imports). Default 40s may be too short.
    worker_initialization_timeout: 120

  metrics:
    collect:
      - "throughput"
      - "latency"

endpoint_config:
  endpoints:
    - "http://localhost:8000"
  api_key: null

report_dir: results/qwen3_vl_235b_a22b_shopify_benchmark_mlperf/
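The 16MB buffer choice is easier to justify once you account for base64's size inflation: every 3 input bytes become 4 output characters, so an image payload grows by a third before it ever crosses the IPC socket. A quick check of that overhead (the 3MB stand-in size is illustrative, not a property of the dataset):

```python
import base64

# base64 encodes every 3 input bytes as 4 output characters, a 4/3 inflation.
raw = b"\xff" * 3_000_000   # 3MB stand-in for decoded image bytes
encoded = base64.b64encode(raw)
print(len(encoded))         # 4000000
assert len(encoded) == 4 * len(raw) // 3
```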

pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ dependencies = [
     "transformers==4.57.1",
     "numpy==2.3.4",
     "datasets==4.1.1",
+    "Pillow==12.1.1",
     "sentencepiece==0.2.1",
     "protobuf==6.33.0",
     "openai_harmony==0.0.8",

src/inference_endpoint/async_utils/transport/zmq/transport.py

Lines changed: 2 additions & 1 deletion

@@ -92,6 +92,7 @@ class _ZMQSocketConfig:
     high_water_mark: int = 0  # 0 = unlimited
     linger: int = -1  # Block indefinitely on close to send pending messages
     immediate: int = 1  # Only enqueue on ready connections
+    # Default 4MB; increase for multimodal (VL) payloads via HTTPClientConfig / YAML / CLI.
     recv_buffer_size: int = 4 * 1024 * 1024  # 4MB
     send_buffer_size: int = 4 * 1024 * 1024  # 4MB

@@ -646,7 +647,7 @@ def create(
         num_workers: Number of workers (required).
         zmq_context: Managed ZMQ context (e.g. from ManagedZMQContext.scoped()).
         *args: Ignored - prevents any errors with extraneous args and adheres with WorkerPoolTransport.create().
-        **kwargs: Optional _ZMQSocketConfig overrides.
+        **kwargs: Optional _ZMQSocketConfig overrides (e.g. ``recv_buffer_size``, ``send_buffer_size``).
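ZMQ's receive/send buffer options ultimately hand these sizes to the kernel, which may round or clamp them. The same behavior can be observed with a plain stdlib socket (the 64KB request below is illustrative; this is not the transport's code path):

```python
import socket

# Request a 64KB receive buffer; the kernel may round it up (Linux doubles the
# request for bookkeeping) or clamp it (net.core.rmem_max), so read back the
# effective value rather than trusting the requested one.
requested = 65536
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, requested)
effective = s.getsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF)
print(effective)
s.close()
```

This is why very large buffer requests in the YAML config may silently not take full effect unless the host's socket memory limits are raised as well.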

src/inference_endpoint/cli.py

Lines changed: 20 additions & 0 deletions

@@ -91,6 +91,7 @@ def create_parser() -> argparse.ArgumentParser:
         "QPS is used to calculate total queries (QPS × duration).",
     )
     _add_shared_benchmark_args(offline_parser)
+    _add_zmq_buffer_args(offline_parser)
     _add_auxiliary_args(offline_parser)

     # benchmark online
@@ -101,6 +102,7 @@ def create_parser() -> argparse.ArgumentParser:
     )
     _add_shared_benchmark_args(online_parser)
     _add_online_specific_args(online_parser)
+    _add_zmq_buffer_args(online_parser)
     _add_auxiliary_args(online_parser)

     # benchmark from-config (YAML mode)
@@ -283,6 +285,24 @@ def _add_auxiliary_args(parser):
     )


+def _add_zmq_buffer_args(parser):
+    """ZMQ IPC buffer sizes for offline/online CLI mode only (not from-config)."""
+    parser.add_argument(
+        "--zmq-recv-buffer-bytes",
+        type=int,
+        default=argparse.SUPPRESS,
+        metavar="N",
+        help="ZMQ receive buffer size in bytes (default: 4MB; offline/online only)",
+    )
+    parser.add_argument(
+        "--zmq-send-buffer-bytes",
+        type=int,
+        default=argparse.SUPPRESS,
+        metavar="N",
+        help="ZMQ send buffer size in bytes (default: 4MB; offline/online only)",
+    )
+
+
 # Argparse structure enforces arg validity - no manual validation needed
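The `default=argparse.SUPPRESS` choice is what lets the config builder distinguish "flag omitted" from "flag set to its default": when suppressed, the attribute simply never appears on the namespace, so a `hasattr()` check downstream can leave the schema's own 4MB default untouched. A minimal standalone demonstration:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--zmq-recv-buffer-bytes", type=int, default=argparse.SUPPRESS)

# Flag omitted: the namespace has no such attribute at all.
args = parser.parse_args([])
print(hasattr(args, "zmq_recv_buffer_bytes"))  # False

# Flag given: the attribute exists and carries the parsed value.
args = parser.parse_args(["--zmq-recv-buffer-bytes", "16777216"])
print(args.zmq_recv_buffer_bytes)  # 16777216
```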

src/inference_endpoint/commands/benchmark.py

Lines changed: 26 additions & 9 deletions

@@ -64,7 +64,9 @@
 from inference_endpoint.dataset_manager.dataset import Dataset
 from inference_endpoint.dataset_manager.factory import DataLoaderFactory
 from inference_endpoint.endpoint_client.config import HTTPClientConfig
-from inference_endpoint.endpoint_client.cpu_affinity import pin_loadgen
+from inference_endpoint.endpoint_client.cpu_affinity import (
+    pin_loadgen,
+)
 from inference_endpoint.endpoint_client.http_client import HTTPEndpointClient
 from inference_endpoint.endpoint_client.http_sample_issuer import HttpClientSampleIssuer
 from inference_endpoint.evaluation import Extractor

@@ -292,6 +294,16 @@ def _build_config_from_cli(
     timeout = getattr(args, "timeout", None)
     verbose_level = getattr(args, "verbose", 0)
     api_type = APIType(getattr(args, "api_type", "openai"))
+    client_kwargs: dict[str, Any] = {
+        "workers": args.workers if args.workers else -1,
+        "log_level": "DEBUG" if verbose_level >= 2 else "INFO",
+        "warmup_connections": getattr(args, "warmup_connections", -1),
+        "max_connections": getattr(args, "max_connections", None) or -1,
+    }
+    if hasattr(args, "zmq_recv_buffer_bytes"):
+        client_kwargs["zmq_recv_buffer_bytes"] = args.zmq_recv_buffer_bytes
+    if hasattr(args, "zmq_send_buffer_bytes"):
+        client_kwargs["zmq_send_buffer_bytes"] = args.zmq_send_buffer_bytes
     # Build BenchmarkConfig from CLI params
     return BenchmarkConfig(
         name=f"cli_{benchmark_mode}",

@@ -322,12 +334,7 @@
             scheduler_random_seed=42,
             dataloader_random_seed=42,
         ),
-        client=ClientSettings(
-            workers=args.workers if args.workers else -1,
-            log_level="DEBUG" if verbose_level >= 2 else "INFO",
-            warmup_connections=getattr(args, "warmup_connections", -1),
-            max_connections=getattr(args, "max_connections", None) or -1,
-        ),
+        client=ClientSettings(**client_kwargs),
     ),
     model_params=ModelParams(
         name=args.model,

@@ -580,6 +587,13 @@ def _run_benchmark(
     try:
         api_type: APIType = config.endpoint_config.api_type
         assert api_type is not None
+        warmup = config.settings.client.warmup_connections
+        max_conn = config.settings.client.max_connections
+        init_timeout = config.settings.client.worker_initialization_timeout
+        logger.info(
+            f"HTTP client: workers={num_workers}, warmup_connections={warmup}, "
+            f"max_connections={max_conn}, worker_init_timeout={init_timeout}s"
+        )
         http_config = HTTPClientConfig(
             endpoint_urls=[urljoin(e, api_type.default_route()) for e in endpoints],
             api_type=api_type,

@@ -588,9 +602,12 @@
             event_logs_dir=report_dir,
             log_level=config.settings.client.log_level,
             cpu_affinity=affinity_plan,
-            warmup_connections=config.settings.client.warmup_connections,
-            max_connections=config.settings.client.max_connections,
+            warmup_connections=warmup,
+            max_connections=max_conn,
+            worker_initialization_timeout=init_timeout,
             api_key=config.endpoint_config.api_key,
+            zmq_recv_buffer_bytes=config.settings.client.zmq_recv_buffer_bytes,
+            zmq_send_buffer_bytes=config.settings.client.zmq_send_buffer_bytes,
         )
         http_client = HTTPEndpointClient(http_config, zmq_context=zmq_ctx)
         sample_issuer = HttpClientSampleIssuer(http_client)

src/inference_endpoint/config/schema.py

Lines changed: 16 additions & 0 deletions

@@ -299,6 +299,22 @@ class ClientSettings(BaseModel):
     # -1 = unlimited (bound by system ephemeral port limit)
     max_connections: int = -1

+    # Seconds to wait for workers to initialize (spawn, connect, signal ready).
+    # Increase for slow systems or when workers load heavy dependencies.
+    worker_initialization_timeout: float = 40.0
+
+    # ZMQ IPC socket buffer sizes (bytes). Increase for large multimodal requests.
+    zmq_recv_buffer_bytes: int = Field(
+        default=4 * 1024 * 1024,
+        ge=1,
+        description="ZMQ receive buffer size in bytes (default 4MB).",
+    )
+    zmq_send_buffer_bytes: int = Field(
+        default=4 * 1024 * 1024,
+        ge=1,
+        description="ZMQ send buffer size in bytes (default 4MB).",
+    )
+

 class Settings(BaseModel):
     """Test settings (can be overridden by CLI)."""
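The `ge=1` bound makes Pydantic reject non-positive buffer sizes at config-load time, instead of letting a zero reach the socket layer. The same guard, sketched with a stdlib dataclass (`ClientSettingsSketch` is an illustrative stand-in, not the project's Pydantic model):

```python
from dataclasses import dataclass

@dataclass
class ClientSettingsSketch:
    """Stdlib stand-in for the Pydantic model's ge=1 bound on buffer sizes."""
    zmq_recv_buffer_bytes: int = 4 * 1024 * 1024
    zmq_send_buffer_bytes: int = 4 * 1024 * 1024

    def __post_init__(self) -> None:
        # Mirror Field(ge=1): reject anything below 1 byte at construction time.
        for name in ("zmq_recv_buffer_bytes", "zmq_send_buffer_bytes"):
            if getattr(self, name) < 1:
                raise ValueError(f"{name} must be >= 1")

ClientSettingsSketch()  # defaults pass validation
try:
    ClientSettingsSketch(zmq_recv_buffer_bytes=0)
except ValueError as exc:
    print(exc)  # zmq_recv_buffer_bytes must be >= 1
```

Failing fast here is the right design: a zero-sized buffer would otherwise surface much later as opaque ZMQ send/receive failures mid-benchmark.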

src/inference_endpoint/dataset_manager/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -27,6 +27,7 @@
 from .predefined.livecodebench import LiveCodeBench
 from .predefined.open_orca import OpenOrca
 from .predefined.random import RandomDataset
+from .predefined.shopify_product_catalogue import ShopifyProductCatalogue
 from .transforms import (
     AddStaticColumns,
     ColumnFilter,

@@ -56,4 +57,5 @@
     "LiveCodeBench",
     "CNNDailyMail",
     "RandomDataset",
+    "ShopifyProductCatalogue",
 ]

0 commit comments