Skip to content

Commit 9cd8039

Browse files
committed
Fix matplotlib import/backend handling and the shell server-readiness grep; add `--verbose` to the concurrency sweep
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
1 parent 2fd76c4 commit 9cd8039

File tree

5 files changed

+84
-25
lines changed

5 files changed

+84
-25
lines changed

examples/08_Qwen2.5-0.5B_Example/QUICKSTART.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,12 @@ python scripts/concurrency_sweep/summarize.py \
2727
results/qwen_online_benchmark/concurrency_sweep/
2828
```
2929

30+
To stream benchmark output to the terminal in real time, in addition to saving it to the per-run log file (useful when debugging failures):
31+
32+
```bash
33+
python scripts/concurrency_sweep/run.py \
34+
--config examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml \
35+
--verbose
36+
```
37+
3038
For manual setup, server commands, and config details, see [README.md](README.md).

examples/08_Qwen2.5-0.5B_Example/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,4 +117,6 @@ python scripts/concurrency_sweep/summarize.py \
117117

118118
- The online sweep defaults to `1 2 4 8 16 32 64 128 256 512 1024`.
119119
- Use `scripts/concurrency_sweep/run.py --concurrency ... --duration-ms ...` to shorten or customize the sweep.
120+
- Add `--verbose` to stream benchmark output live to the terminal in addition to saving it to the per-run log file (useful when debugging failures).
121+
- The full output of every run is saved to `<sweep_root>/concurrency_<N>/benchmark.log`, with or without `--verbose`; inspect that file if a run fails.
120122
- If vLLM runs out of memory at higher concurrency, lower `--gpu-memory-utilization`.

examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,13 +129,13 @@ RETRY_COUNT=0
129129

130130
# Different ready indicators for vLLM vs SGLang
131131
if [[ "$SERVER_TYPE" == "vllm" ]]; then
132-
READY_PATTERN="Uvicorn running\|Application startup complete"
132+
READY_PATTERN="Uvicorn running|Application startup complete"
133133
else
134-
READY_PATTERN="Uvicorn running\|Server is ready"
134+
READY_PATTERN="Uvicorn running|Server is ready"
135135
fi
136136

137137
while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
138-
if docker logs ${CONTAINER_NAME} 2>&1 | grep -q "$READY_PATTERN"; then
138+
if docker logs ${CONTAINER_NAME} 2>&1 | grep -qE "$READY_PATTERN"; then
139139
echo "✅ Server is ready!"
140140
break
141141
fi

scripts/concurrency_sweep/run.py

Lines changed: 52 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,14 @@ def parse_args() -> argparse.Namespace:
8989
default=DEFAULT_TIMEOUT_S,
9090
help="Per-run subprocess timeout in seconds (includes setup and teardown).",
9191
)
92+
parser.add_argument(
93+
"--verbose",
94+
action="store_true",
95+
help=(
96+
"Stream benchmark output to the terminal in real time in addition "
97+
"to saving it to the per-run log file. Useful for debugging failures."
98+
),
99+
)
92100
return parser.parse_args()
93101

94102

@@ -117,33 +125,56 @@ def render_config(
117125

118126

119127
def run_single_benchmark(
120-
config: dict, timeout_seconds: int, log_path: Path
128+
config: dict, timeout_seconds: int, log_path: Path, verbose: bool = False
121129
) -> tuple[str, str]:
122130
with tempfile.NamedTemporaryFile(
123131
mode="w", suffix=".yaml", delete=False, dir="."
124132
) as tmp:
125133
yaml.safe_dump(config, tmp, sort_keys=False)
126134
temp_config_path = Path(tmp.name)
127135

136+
cmd = [
137+
"inference-endpoint",
138+
"benchmark",
139+
"from-config",
140+
"-c",
141+
str(temp_config_path),
142+
]
143+
128144
try:
129-
with log_path.open("w") as log_file:
130-
result = subprocess.run(
131-
[
132-
"inference-endpoint",
133-
"benchmark",
134-
"from-config",
135-
"-c",
136-
str(temp_config_path),
137-
],
138-
stdout=log_file,
139-
stderr=subprocess.STDOUT,
140-
text=True,
141-
timeout=timeout_seconds,
142-
check=False,
143-
)
144-
if result.returncode == 0:
145+
if verbose:
146+
with log_path.open("w") as log_file:
147+
proc = subprocess.Popen(
148+
cmd,
149+
stdout=subprocess.PIPE,
150+
stderr=subprocess.STDOUT,
151+
text=True,
152+
)
153+
assert proc.stdout is not None
154+
for line in proc.stdout:
155+
print(line, end="", flush=True)
156+
log_file.write(line)
157+
try:
158+
proc.wait(timeout=timeout_seconds)
159+
except subprocess.TimeoutExpired:
160+
proc.kill()
161+
raise
162+
returncode = proc.returncode
163+
else:
164+
with log_path.open("w") as log_file:
165+
result = subprocess.run(
166+
cmd,
167+
stdout=log_file,
168+
stderr=subprocess.STDOUT,
169+
text=True,
170+
timeout=timeout_seconds,
171+
check=False,
172+
)
173+
returncode = result.returncode
174+
175+
if returncode == 0:
145176
return "success", ""
146-
return "failed", f"exit code {result.returncode}"
177+
return "failed", f"exit code {returncode}, see {log_path}"
147178
except subprocess.TimeoutExpired:
148179
return "timeout", f"exceeded {timeout_seconds} seconds"
149180
finally:
@@ -209,8 +240,11 @@ def main() -> int:
209240
config=config,
210241
timeout_seconds=args.timeout_seconds,
211242
log_path=log_path,
243+
verbose=args.verbose,
212244
)
213245
print(f" status: {status}" + (f" ({detail})" if detail else ""))
246+
if status != "success" and not args.verbose:
247+
print(f" Re-run with --verbose to stream output, or inspect: {log_path}")
214248

215249
summary_rows.append(
216250
{

scripts/concurrency_sweep/summarize.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,15 @@ def f(v: object) -> float:
8484

8585

8686
def collect_results(sweep_dir: Path) -> list[dict]:
87+
def _concurrency_key(p: Path) -> int:
88+
try:
89+
return int(p.name.split("_")[1])
90+
except (IndexError, ValueError):
91+
return -1
92+
8793
rows = []
8894
for concurrency_dir in sorted(
89-
sweep_dir.glob("concurrency_*"),
90-
key=lambda p: int(p.name.split("_")[1]),
95+
sweep_dir.glob("concurrency_*"), key=_concurrency_key
9196
):
9297
try:
9398
concurrency = int(concurrency_dir.name.split("_")[1])
@@ -319,7 +324,15 @@ def md_table(headers: list[str], col_keys: list[str], data: list[dict]) -> str:
319324

320325

321326
def write_plots(rows: list[dict], path: Path) -> None:
322-
import matplotlib.pyplot as plt
327+
try:
328+
import matplotlib
329+
import matplotlib.ticker
330+
331+
matplotlib.use("Agg")
332+
import matplotlib.pyplot as plt
333+
except ImportError:
334+
print("matplotlib not installed; skipping plot generation.", file=sys.stderr)
335+
return
323336

324337
successful = [r for r in rows if r["status"] == "ok"]
325338
if not successful:
@@ -344,7 +357,9 @@ def write_plots(rows: list[dict], path: Path) -> None:
344357
ax.set_ylabel(ylabel)
345358
ax.set_xscale("log", base=2)
346359
ax.set_xticks(x)
347-
ax.get_xaxis().set_major_formatter(plt.FuncFormatter(lambda v, _: str(int(v))))
360+
ax.get_xaxis().set_major_formatter(
361+
matplotlib.ticker.FuncFormatter(lambda v, _: str(int(v)))
362+
)
348363
ax.legend(loc="upper left")
349364
ax.grid(True, which="both", linestyle="--", alpha=0.5)
350365

0 commit comments

Comments
 (0)