|
2 | 2 |
|
3 | 3 | from ai_infra_bench import client_gen |
4 | 4 |
|
# input args
# Benchmark target and dataset come from the environment; indexing (rather
# than .get) makes a missing variable fail fast with KeyError instead of
# silently benchmarking a wrong endpoint.
base_url, dataset_path = (
    os.environ["BASE_URL"],
    os.environ["SHAREGPT_DATASET"],
)
# Per-run input knobs that are recorded alongside each result row.
input_features = [
    "random_input_len",
    "random_output_len",
    "request_rate",
    "max_concurrency",
]
# Columns harvested from bench_serving output: mean and p99 for each latency
# statistic (TTFT, TPOT, ITL, end-to-end), plus aggregate output throughput.
output_metrics = [
    f"{stat}_{metric}"
    for metric in ("ttft_ms", "tpot_ms", "itl_ms", "e2e_latency_ms")
    for stat in ("mean", "p99")
] + ["output_throughput"]
7 | 25 |
|
| 26 | +# construct client requests |
8 | 27 | client_template = """ |
9 | | -python -m sglang.bench_serving \ |
| 28 | +python -m sglang.bench_serving |
10 | 29 | --base-url {base_url} |
11 | 30 | --backend sglang-oai |
12 | | - --tokenizer deepseek-ai/DeepSeek-R1-0528 |
13 | | - --model deepseek-ai/DeepSeek-R1-0528 |
| 31 | + --tokenizer Qwen/Qwen3-0.6B |
| 32 | + --model Qwen/Qwen3-0.6B |
14 | 33 | --dataset-path {dataset_path} |
15 | 34 | --dataset-name random |
16 | 35 | --random-range-ratio 1 |
|
20 | 39 | --max-concurrency {request_rate} |
21 | 40 | --num-prompt {num_prompt} |
22 | 41 | """ |
23 | | -rate_lists = [1, 2, 4, 8, 16, 24, 32, 40] |
| 42 | +rate_lists = [1, 2, 4, 8] |
24 | 43 | client_cmds = [ |
25 | 44 | *[ |
26 | 45 | client_template.format( |
27 | 46 | base_url=base_url, |
28 | | - input_len=2000, |
29 | | - output_len=1500, |
| 47 | + input_len=1200, |
| 48 | + output_len=800, |
30 | 49 | dataset_path=dataset_path, |
31 | 50 | request_rate=rate, |
32 | | - num_prompt=rate * 10, |
| 51 | + num_prompt=min(max(rate * 10, 80), 250), # clip to [80, 250] |
33 | 52 | ) |
34 | 53 | for rate in rate_lists |
35 | 54 | ], |
36 | 55 | *[ |
37 | 56 | client_template.format( |
38 | 57 | base_url=base_url, |
39 | | - input_len=900, |
| 58 | + input_len=800, |
40 | 59 | output_len=1200, |
41 | 60 | dataset_path=dataset_path, |
42 | 61 | request_rate=rate, |
43 | | - num_prompt=rate * 10, |
| 62 | + num_prompt=min(max(rate * 10, 80), 250), # clip to [80, 250] |
44 | 63 | ) |
45 | 64 | for rate in rate_lists |
46 | 65 | ], |
|
51 | 70 | output_len=1500, |
52 | 71 | dataset_path=dataset_path, |
53 | 72 | request_rate=rate, |
54 | | - num_prompt=rate * 10, |
| 73 | + num_prompt=min(max(rate * 10, 80), 250), # clip to [80, 250] |
55 | 74 | ) |
56 | 75 | for rate in rate_lists |
57 | 76 | ], |
58 | 77 | ] |
59 | 78 |
|
60 | | -input_features = [ |
61 | | - "random_input_len", |
62 | | - "random_output_len", |
63 | | - "request_rate", |
64 | | - "max_concurrency", |
65 | | -] |
66 | | - |
67 | | -output_metrics = [ |
68 | | - "p99_ttft_ms", |
69 | | - "p99_tpot_ms", |
70 | | - "p99_itl_ms", |
71 | | - "output_throughput", |
72 | | - "p99_e2e_latency_ms", |
73 | | - "completed", |
74 | | -] |
75 | 79 |
|
76 | 80 | if __name__ == "__main__": |
77 | 81 | client_gen( |
78 | 82 | client_cmds=client_cmds, |
79 | 83 | input_features=input_features, |
80 | 84 | output_metrics=output_metrics, |
81 | | - server_labels="deepseek_r1", |
| 85 | + server_labels="qwen3_06b", |
| 86 | + n=3, |
| 87 | + only_last=True, |
82 | 88 | output_dir="output", |
83 | 89 | ) |
0 commit comments