
Commit 7abe7ff

LJH-LBJ authored and hsliuustc0106 committed
[Feature] Opt metrics structure (vllm-project#891)
Signed-off-by: Junhong Liu <98734602+LJH-LBJ@users.noreply.github.com>
Signed-off-by: Junhong Liu <ljh_lbj@163.com>
Co-authored-by: Hongsheng Liu <liuhongsheng4@huawei.com>
1 parent 4c77379 commit 7abe7ff

File tree

23 files changed: +1151 −728 lines changed


benchmarks/qwen3-omni/README.md

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ bash benchmarks/qwen3-omni/vllm_omni/eval_qwen3_moe_omni.sh
 ```
 
 What it does:
-- Runs `examples/offline_inference/qwen3_omni/end2end.py` with `--enable-stats`.
+- Runs `examples/offline_inference/qwen3_omni/end2end.py` with `--log-stats`.
 - Uses `benchmarks/build_dataset/top100.txt` and writes to:
   - Logs: `benchmarks/qwen3-omni/vllm_omni/logs/`
   - `omni_llm_pipeline_text.orchestrator.stats.jsonl` — per-stage latency stats.

benchmarks/qwen3-omni/vllm_omni/eval_qwen3_moe_omni.sh

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ else
 python $end2end_script_path --output-wav $outputs_dir \
   --query-type text \
   --txt-prompts $build_dataset_path \
-  --enable-stats \
+  --log-stats \
   --log-dir $log_dir
 echo "Logs and outputs are saved in ${log_dir} and ${outputs_dir} respectively:"
 echo " - omni_llm_pipeline_text run dir/base name"

docs/api/README.md

Lines changed: 5 additions & 3 deletions
@@ -15,9 +15,6 @@ Main entry points for vLLM-Omni inference and serving.
 - [vllm_omni.entrypoints.cli.benchmark.serve.OmniBenchmarkServingSubcommand][]
 - [vllm_omni.entrypoints.cli.serve.OmniServeCommand][]
 - [vllm_omni.entrypoints.client_request_state.ClientRequestState][]
-- [vllm_omni.entrypoints.log_utils.OrchestratorMetrics][]
-- [vllm_omni.entrypoints.log_utils.StageRequestMetrics][]
-- [vllm_omni.entrypoints.log_utils.StageStats][]
 - [vllm_omni.entrypoints.omni.Omni][]
 - [vllm_omni.entrypoints.omni.OmniBase][]
 - [vllm_omni.entrypoints.omni_diffusion.OmniDiffusion][]
@@ -114,3 +111,8 @@ Worker classes and model runners for distributed inference.
 - [vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker][]
 - [vllm_omni.worker.gpu_model_runner.OmniGPUModelRunner][]
 - [vllm_omni.worker.mixins.OmniWorkerMixin][]
+
+
+## Metrics
+
+- [vllm_omni.metrics.OrchestratorAggregator][]

docs/contributing/metrics.md

Lines changed: 165 additions & 0 deletions
@@ -0,0 +1,165 @@
+# vLLM-Omni Metrics
+
+You can use these metrics in production to monitor the health and performance of the vLLM-Omni system. Typical scenarios include:
+
+- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `e2e_total_ms`), and resource utilization to verify that the system meets expected standards.
+- **Debugging and Troubleshooting**: Use detailed per-request metrics to diagnose issues such as high transfer times or unexpected token counts.
+
+## How to Enable and View Metrics
+
+### 1. Start the Service with Metrics Logging
+
+```bash
+vllm serve /workspace/models/Qwen3-Omni-30B-A3B-Instruct --omni --port 8014 --log-stats
+```
+
+### 2. Send a Request
+
+```bash
+python openai_chat_completion_client_for_multimodal_generation.py --query-type use_image
+```
+
+### 3. What You Will See
+
+With `--log-stats` enabled, the server outputs detailed metrics logs after each request. Example output:
+
+#### Overall Summary
+
+| Field                       | Value      |
+|-----------------------------|------------|
+| e2e_requests                | 1          |
+| e2e_wall_time_ms            | 41,299.190 |
+| e2e_total_tokens            | 5,202      |
+| e2e_avg_time_per_request_ms | 41,299.190 |
+| e2e_avg_tokens_per_s        | 125.959    |
+| e2e_stage_0_wall_time_ms    | 10,192.289 |
+| e2e_stage_1_wall_time_ms    | 30,541.409 |
+| e2e_stage_2_wall_time_ms    | 207.496    |
+
+#### RequestE2EStats
+
+| Field                   | Value       |
+|-------------------------|-------------|
+| e2e_total_ms            | 41,299.133  |
+| e2e_total_tokens        | 5,202       |
+| transfers_total_time_ms | 245.895     |
+| transfers_total_kbytes  | 138,089.939 |
+
+#### StageRequestStats
+
+| Field                  | 0         | 1          | 2       |
+|------------------------|-----------|------------|---------|
+| audio_generated_frames | 0         | 0          | 525,525 |
+| batch_id               | 38        | 274        | 0       |
+| batch_size             | 1         | 1          | 1       |
+| num_tokens_in          | 4,860     | 4,826      | 4,384   |
+| num_tokens_out         | 67        | 275        | 0       |
+| postprocess_time_ms    | 256.158   | 0.491      | 0.000   |
+| stage_gen_time_ms      | 9,910.007 | 30,379.198 | 160.745 |
+
+#### TransferEdgeStats
+
+| Field               | 0->1        | 1->2       |
+|---------------------|-------------|------------|
+| size_kbytes         | 109,277.349 | 28,812.591 |
+| tx_time_ms          | 78.701      | 18.790     |
+| rx_decode_time_ms   | 111.865     | 31.706     |
+| in_flight_time_ms   | 2.015       | 2.819      |
+
+These logs include:
+
+- **Overall summary**: total requests, wall time, average tokens/sec, etc.
+- **E2E table**: per-request latency and token counts.
+- **Stage table**: per-stage batch and timing details.
+- **Transfer table**: data transfer and timing for each edge.
+
+You can use these logs to monitor system health, debug performance, and analyze request-level metrics as described above.
+
+## Metrics Scope: Offline vs Online Inference
+
+For **offline inference** (batch mode), the summary includes both system-level metrics (aggregated across all requests) and per-request metrics. In this case, `e2e_requests` can be greater than 1, reflecting multiple completed requests in a batch.
+
+For **online inference** (serving mode), the summary is always per-request: `e2e_requests` is always 1, and only request-level metrics are reported for each completion.
+
+---
+
+## Parameter Details
+
+| Field | Meaning |
+|-------|---------|
+| `e2e_requests` | Number of completed requests. |
+| `e2e_wall_time_ms` | Wall-clock time span from run start to last completion, in ms. |
+| `e2e_total_tokens` | Total tokens counted across all completed requests (stage 0 input + all stage outputs). |
+| `e2e_avg_time_per_request_ms` | Average wall time per request: `e2e_wall_time_ms / e2e_requests`. |
+| `e2e_avg_tokens_per_s` | Average token throughput over wall time: `e2e_total_tokens * 1000 / e2e_wall_time_ms`. |
+| `e2e_stage_{i}_wall_time_ms` | Wall-clock time span for stage i, in ms. Each stage's wall time is reported as a separate field, e.g., `e2e_stage_0_wall_time_ms`, `e2e_stage_1_wall_time_ms`, etc. |
+
+---
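The averaged fields above are simple ratios over the summary values. As a quick sanity check, this sketch recomputes both averages from the example "Overall Summary" table (numbers taken directly from the tables in this doc):

```python
# Values from the example "Overall Summary" table above.
e2e_requests = 1
e2e_wall_time_ms = 41_299.190
e2e_total_tokens = 5_202

# e2e_avg_time_per_request_ms = e2e_wall_time_ms / e2e_requests
avg_time_per_request_ms = e2e_wall_time_ms / e2e_requests

# e2e_avg_tokens_per_s = e2e_total_tokens * 1000 / e2e_wall_time_ms
avg_tokens_per_s = e2e_total_tokens * 1000 / e2e_wall_time_ms

print(round(avg_time_per_request_ms, 3))  # 41299.19
print(round(avg_tokens_per_s, 3))         # 125.959
```

Both results match the reported `e2e_avg_time_per_request_ms` and `e2e_avg_tokens_per_s`.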
+## E2E Table (per request)
+
+| Field | Meaning |
+|-------|---------|
+| `e2e_total_ms` | End-to-end latency in ms. |
+| `e2e_total_tokens` | Total tokens for the request (stage 0 input + all stage outputs). |
+| `transfers_total_time_ms` | Sum of transfer edge `total_time_ms` for this request. |
+| `transfers_total_kbytes` | Sum of transfer kbytes for this request. |
+
+---
+
+## Stage Table (per stage event / request)
+
+| Field | Meaning |
+|-------|---------|
+| `batch_id` | Batch index. |
+| `batch_size` | Batch size. |
+| `num_tokens_in` | Input tokens to the stage. |
+| `num_tokens_out` | Output tokens from the stage. |
+| `stage_gen_time_ms` | Stage compute time in ms, excluding postprocessing time (reported separately as `postprocess_time_ms`). |
+| `image_num` | Number of images generated (for diffusion/image stages). |
+| `resolution` | Image resolution (for diffusion/image stages). |
+| `postprocess_time_ms` | Post-processing time in ms (for diffusion/image stages). |
+
+---
+
+## Transfer Table (per edge / request)
+
+| Field | Meaning |
+|-------|---------|
+| `size_kbytes` | Total kbytes transferred. |
+| `tx_time_ms` | Sender transfer time in ms. |
+| `rx_decode_time_ms` | Receiver decode time in ms. |
+| `in_flight_time_ms` | In-flight time in ms. |
+
+## Expectation of the Numbers (Verification)
+
+**Formulas:**
+
+- `e2e_total_tokens = stage 0's num_tokens_in + sum(all stages' num_tokens_out)`
+- `transfers_total_time_ms = sum(tx_time_ms + rx_decode_time_ms + in_flight_time_ms)` over every edge
+
+**Using the example above:**
+
+### e2e_total_tokens
+
+- Stage 0's `num_tokens_in`: **4,860**
+- Stage 0's `num_tokens_out`: **67**
+- Stage 1's `num_tokens_out`: **275**
+- Stage 2's `num_tokens_out`: **0**
+
+So,
+
+```
+e2e_total_tokens = 4,860 + 67 + 275 + 0 = 5,202
+```
+
+This matches the table value: `e2e_total_tokens = 5,202`.
+
+### transfers_total_time_ms
+
+For each edge:
+
+- 0->1: tx_time_ms (**78.701**) + rx_decode_time_ms (**111.865**) + in_flight_time_ms (**2.015**) = **192.581**
+- 1->2: tx_time_ms (**18.790**) + rx_decode_time_ms (**31.706**) + in_flight_time_ms (**2.819**) = **53.315**
+
+Sum: 192.581 + 53.315 = **245.896**
+
+The table shows `transfers_total_time_ms = 245.895`, which matches the calculation (the difference is due to rounding).
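The verification arithmetic above can also be scripted. A minimal sketch that recomputes both checks from the example tables (values copied from the doc, nothing else assumed):

```python
# Per-stage token counts from the example StageRequestStats table.
num_tokens_in_stage0 = 4_860
num_tokens_out = [67, 275, 0]  # stages 0, 1, 2

# e2e_total_tokens = stage 0 input tokens + all stage output tokens
e2e_total_tokens = num_tokens_in_stage0 + sum(num_tokens_out)
print(e2e_total_tokens)  # 5202

# Per-edge (tx_time_ms, rx_decode_time_ms, in_flight_time_ms)
# from the example TransferEdgeStats table.
edges = {
    "0->1": (78.701, 111.865, 2.015),
    "1->2": (18.790, 31.706, 2.819),
}
transfers_total_time_ms = sum(tx + rx + flight for tx, rx, flight in edges.values())
print(round(transfers_total_time_ms, 3))  # 245.896
```

The recomputed transfer total differs from the reported 245.895 only in the last digit, because the per-edge values in the table are themselves rounded.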

examples/offline_inference/bagel/end2end.py

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@ def parse_args():
     )
 
     # OmniLLM init args
-    parser.add_argument("--enable-stats", action="store_true", default=False)
+    parser.add_argument("--log-stats", action="store_true", default=False)
     parser.add_argument("--init-sleep-seconds", type=int, default=20)
     parser.add_argument("--batch-timeout", type=int, default=5)
     parser.add_argument("--init-timeout", type=int, default=300)
@@ -120,7 +120,7 @@ def main():
 
     omni_kwargs.update(
         {
-            "log_stats": args.enable_stats,
+            "log_stats": args.log_stats,
             "init_sleep_seconds": args.init_sleep_seconds,
             "batch_timeout": args.batch_timeout,
             "init_timeout": args.init_timeout,

examples/offline_inference/qwen2_5_omni/end2end.py

Lines changed: 2 additions & 2 deletions
@@ -322,7 +322,7 @@ def main(args):
     query_result = query_func()
     omni_llm = Omni(
         model=model_name,
-        log_stats=args.enable_stats,
+        log_stats=args.log_stats,
         stage_init_timeout=args.stage_init_timeout,
         batch_timeout=args.batch_timeout,
         init_timeout=args.init_timeout,
@@ -439,7 +439,7 @@ def parse_args():
         help="Query type.",
     )
     parser.add_argument(
-        "--enable-stats",
+        "--log-stats",
        action="store_true",
        default=False,
        help="Enable writing detailed statistics (default: disabled)",
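The same rename pattern repeats across the example scripts: the CLI flag becomes `--log-stats` and is forwarded as the `log_stats` keyword. A self-contained sketch of that wiring (standalone argparse snippet, not the actual example script):

```python
import argparse

# Sketch of the renamed flag: the examples now expose --log-stats and
# pass the parsed value straight through as the log_stats keyword.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--log-stats",
    action="store_true",
    default=False,
    help="Enable writing detailed statistics (default: disabled)",
)

args = parser.parse_args(["--log-stats"])
omni_kwargs = {"log_stats": args.log_stats}  # forwarded to Omni(...)
print(omni_kwargs)  # {'log_stats': True}
```

Note that argparse converts the dashed flag to the attribute `args.log_stats`, so the attribute name now matches the keyword argument, which is the point of the rename.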

examples/offline_inference/qwen3_omni/end2end.py

Lines changed: 2 additions & 2 deletions
@@ -328,7 +328,7 @@ def main(args):
     omni_llm = Omni(
         model=model_name,
         stage_configs_path=args.stage_configs_path,
-        log_stats=args.enable_stats,
+        log_stats=args.log_stats,
         stage_init_timeout=args.stage_init_timeout,
     )
 
@@ -455,7 +455,7 @@ def parse_args():
         help="Query type.",
     )
     parser.add_argument(
-        "--enable-stats",
+        "--log-stats",
        action="store_true",
        default=False,
        help="Enable writing detailed statistics (default: disabled)",

examples/offline_inference/qwen3_tts/end2end.py

Lines changed: 2 additions & 2 deletions
@@ -219,7 +219,7 @@ def main(args):
     omni = Omni(
         model=model_name,
         stage_configs_path=args.stage_configs_path,
-        log_stats=args.enable_stats,
+        log_stats=args.log_stats,
         stage_init_timeout=args.stage_init_timeout,
     )
 
@@ -275,7 +275,7 @@ def parse_args():
         help="Query type.",
     )
     parser.add_argument(
-        "--enable-stats",
+        "--log-stats",
        action="store_true",
        default=False,
        help="Enable writing detailed statistics (default: disabled)",

requirements/common.txt

Lines changed: 1 addition & 0 deletions
@@ -12,3 +12,4 @@ torchsde>=0.2.6
 openai-whisper>=20250625
 imageio[ffmpeg]>=2.37.2
 sox>=1.5.0
+prettytable>=3.8.0
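The new `prettytable` dependency is presumably what renders the aligned metrics tables shown in the docs above. As a stdlib-only sketch of the same kind of column-aligned output (illustrative, not the library's actual rendering code):

```python
def render_table(field_names, rows):
    """Render rows as a simple aligned text table (stdlib-only sketch)."""
    # Width of each column = widest cell in that column, header included.
    widths = [
        max(len(str(cell)) for cell in [name] + [row[i] for row in rows])
        for i, name in enumerate(field_names)
    ]

    def fmt(cells):
        return " | ".join(str(c).ljust(w) for c, w in zip(cells, widths))

    lines = [fmt(field_names), "-+-".join("-" * w for w in widths)]
    lines += [fmt(row) for row in rows]
    return "\n".join(lines)

print(render_table(
    ["Field", "Value"],
    [["e2e_requests", 1], ["e2e_total_tokens", "5,202"]],
))
```

With `prettytable` itself, the equivalent would be building a `PrettyTable`, setting `field_names`, and calling `add_row` per metric.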

tests/e2e/online_serving/test_async_omni.py

Lines changed: 1 addition & 52 deletions
@@ -9,7 +9,7 @@
 from vllm.inputs import PromptType
 
 from tests.utils import hardware_test
-from vllm_omni.entrypoints.async_omni import AsyncOmni, ClientRequestState
+from vllm_omni.entrypoints.async_omni import AsyncOmni
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
@@ -115,54 +115,3 @@ async def test_abort():
     num_generated_tokens, request_id = await task
     assert num_generated_tokens == NUM_EXPECTED_TOKENS
     await asyncio.sleep(5)
-
-
-@pytest.mark.core_model
-@pytest.mark.omni
-@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
-@pytest.mark.asyncio
-async def test_build_and_log_summary(monkeypatch):
-    from vllm_omni.entrypoints.utils import get_final_stage_id_for_e2e
-
-    RealCRS = ClientRequestState
-    capture_metrics = {}
-
-    class MockCRS(RealCRS):
-        def __init__(self, request_id: str):
-            super().__init__(request_id)
-            capture_metrics[request_id] = self
-
-    monkeypatch.setattr("vllm_omni.entrypoints.async_omni.ClientRequestState", MockCRS)
-    monkeypatch.setattr("vllm_omni.entrypoints.client_request_state.ClientRequestState", MockCRS)
-
-    with ExitStack() as after:
-        # Avoid SHM IPC in tests to prevent /dev/shm exhaustion and SIGBUS.
-        engine = AsyncOmni(
-            model=model,
-            stage_configs_path=stage_config,
-            shm_threshold_bytes=sys.maxsize,
-        )
-        after.callback(engine.shutdown)
-        prompt = "Hello my name is Robert and "
-        NUM_EXPECTED_TOKENS = 64
-        NUM_REQUESTS = 3
-        request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
-
-        # Create concurrent requests.
-        tasks: list[asyncio.Task] = []
-        for idx, request_id in enumerate(request_ids):
-            tasks.append(asyncio.create_task(generate(engine, request_id, prompt, NUM_EXPECTED_TOKENS)))
-
-        # Confirm the requests are okay.
-        for idx, task in enumerate(tasks):
-            await task
-            output_modalities = ["text"]
-            final_stage_id_for_e2e = get_final_stage_id_for_e2e(
-                output_modalities, engine.output_modalities, engine.stage_list
-            )
-            summary = capture_metrics[request_ids[idx]].metrics.build_and_log_summary(final_stage_id_for_e2e)
-
-            # Check that total tokens matches sum of stage tokens.
-            assert summary["e2e_total_tokens"] == sum(stage["tokens"] for stage in summary["stages"])
-            # Check that total time matches sum of stage times.
-            assert summary["e2e_total_time_ms"] >= sum(stage["total_time_ms"] for stage in summary["stages"])