Fix matplotlib import and shell readiness check; add --verbose to concurrency sweep

arekay-nv · arekay-nv · commit 9cd8039b2b24 · 2026-03-19T11:54:09.000-05:00
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
diff --git a/examples/08_Qwen2.5-0.5B_Example/QUICKSTART.md b/examples/08_Qwen2.5-0.5B_Example/QUICKSTART.md
@@ -27,4 +27,12 @@ python scripts/concurrency_sweep/summarize.py \
   results/qwen_online_benchmark/concurrency_sweep/
 ```
 
+To stream output live (useful when debugging failures):
+
+```bash
+python scripts/concurrency_sweep/run.py \
+  --config examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml \
+  --verbose
+```
+
 For manual setup, server commands, and config details, see [README.md](README.md).
diff --git a/examples/08_Qwen2.5-0.5B_Example/README.md b/examples/08_Qwen2.5-0.5B_Example/README.md
@@ -117,4 +117,6 @@ python scripts/concurrency_sweep/summarize.py \
 
 - The online sweep defaults to `1 2 4 8 16 32 64 128 256 512 1024`.
 - Use `scripts/concurrency_sweep/run.py --concurrency ... --duration-ms ...` to shorten or customize the sweep.
+- Add `--verbose` to stream benchmark output live to the terminal (useful when debugging failures).
+- If a run fails without `--verbose`, the full output is in `<sweep_root>/concurrency_<N>/benchmark.log`.
 - If vLLM runs out of memory at higher concurrency, lower `--gpu-memory-utilization`.
diff --git a/examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh b/examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh
@@ -129,13 +129,13 @@ RETRY_COUNT=0
 
 # Different ready indicators for vLLM vs SGLang
 if [[ "$SERVER_TYPE" == "vllm" ]]; then
-    READY_PATTERN="Uvicorn running\|Application startup complete"
+    READY_PATTERN="Uvicorn running|Application startup complete"
 else
-    READY_PATTERN="Uvicorn running\|Server is ready"
+    READY_PATTERN="Uvicorn running|Server is ready"
 fi
 
 while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
-    if docker logs ${CONTAINER_NAME} 2>&1 | grep -q "$READY_PATTERN"; then
+    if docker logs ${CONTAINER_NAME} 2>&1 | grep -qE "$READY_PATTERN"; then
         echo "✅ Server is ready!"
         break
     fi
diff --git a/scripts/concurrency_sweep/run.py b/scripts/concurrency_sweep/run.py
@@ -89,6 +89,14 @@ def parse_args() -> argparse.Namespace:
         default=DEFAULT_TIMEOUT_S,
         help="Per-run subprocess timeout in seconds (includes setup and teardown).",
     )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help=(
+            "Stream benchmark output to the terminal in real time in addition "
+            "to saving it to the per-run log file. Useful for debugging failures."
+        ),
+    )
     return parser.parse_args()
 
 
@@ -117,33 +125,56 @@ def render_config(
 
 
 def run_single_benchmark(
-    config: dict, timeout_seconds: int, log_path: Path
+    config: dict, timeout_seconds: int, log_path: Path, verbose: bool = False
 ) -> tuple[str, str]:
     with tempfile.NamedTemporaryFile(
         mode="w", suffix=".yaml", delete=False, dir="."
     ) as tmp:
         yaml.safe_dump(config, tmp, sort_keys=False)
         temp_config_path = Path(tmp.name)
 
+    cmd = [
+        "inference-endpoint",
+        "benchmark",
+        "from-config",
+        "-c",
+        str(temp_config_path),
+    ]
+
     try:
-        with log_path.open("w") as log_file:
-            result = subprocess.run(
-                [
-                    "inference-endpoint",
-                    "benchmark",
-                    "from-config",
-                    "-c",
-                    str(temp_config_path),
-                ],
-                stdout=log_file,
-                stderr=subprocess.STDOUT,
-                text=True,
-                timeout=timeout_seconds,
-                check=False,
-            )
-        if result.returncode == 0:
+        if verbose:
+            with log_path.open("w") as log_file:
+                proc = subprocess.Popen(
+                    cmd,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                )
+                assert proc.stdout is not None
+                for line in proc.stdout:
+                    print(line, end="", flush=True)
+                    log_file.write(line)
+                try:
+                    proc.wait(timeout=timeout_seconds)
+                except subprocess.TimeoutExpired:
+                    proc.kill()
+                    raise
+            returncode = proc.returncode
+        else:
+            with log_path.open("w") as log_file:
+                result = subprocess.run(
+                    cmd,
+                    stdout=log_file,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                    timeout=timeout_seconds,
+                    check=False,
+                )
+            returncode = result.returncode
+
+        if returncode == 0:
             return "success", ""
-        return "failed", f"exit code {result.returncode}"
+        return "failed", f"exit code {returncode}, see {log_path}"
     except subprocess.TimeoutExpired:
         return "timeout", f"exceeded {timeout_seconds} seconds"
     finally:
@@ -209,8 +240,11 @@ def main() -> int:
             config=config,
             timeout_seconds=args.timeout_seconds,
             log_path=log_path,
+            verbose=args.verbose,
         )
         print(f"  status: {status}" + (f"  ({detail})" if detail else ""))
+        if status != "success" and not args.verbose:
+            print(f"  Re-run with --verbose to stream output, or inspect: {log_path}")
 
         summary_rows.append(
             {
diff --git a/scripts/concurrency_sweep/summarize.py b/scripts/concurrency_sweep/summarize.py
@@ -84,10 +84,15 @@ def f(v: object) -> float:
 
 
 def collect_results(sweep_dir: Path) -> list[dict]:
+    def _concurrency_key(p: Path) -> int:
+        try:
+            return int(p.name.split("_")[1])
+        except (IndexError, ValueError):
+            return -1
+
     rows = []
     for concurrency_dir in sorted(
-        sweep_dir.glob("concurrency_*"),
-        key=lambda p: int(p.name.split("_")[1]),
+        sweep_dir.glob("concurrency_*"), key=_concurrency_key
     ):
         try:
             concurrency = int(concurrency_dir.name.split("_")[1])
@@ -319,7 +324,15 @@ def md_table(headers: list[str], col_keys: list[str], data: list[dict]) -> str:
 
 
 def write_plots(rows: list[dict], path: Path) -> None:
-    import matplotlib.pyplot as plt
+    try:
+        import matplotlib
+        import matplotlib.ticker
+
+        matplotlib.use("Agg")
+        import matplotlib.pyplot as plt
+    except ImportError:
+        print("matplotlib not installed; skipping plot generation.", file=sys.stderr)
+        return
 
     successful = [r for r in rows if r["status"] == "ok"]
     if not successful:
@@ -344,7 +357,9 @@ def write_plots(rows: list[dict], path: Path) -> None:
         ax.set_ylabel(ylabel)
         ax.set_xscale("log", base=2)
         ax.set_xticks(x)
-        ax.get_xaxis().set_major_formatter(plt.FuncFormatter(lambda v, _: str(int(v))))
+        ax.get_xaxis().set_major_formatter(
+            matplotlib.ticker.FuncFormatter(lambda v, _: str(int(v)))
+        )
         ax.legend(loc="upper left")
         ax.grid(True, which="both", linestyle="--", alpha=0.5)