|
2 | 2 |
|
3 | 3 | from ai_infra_bench import client_gen |
4 | 4 |
|
# input args
# Benchmark target and dataset come from the environment; indexing (rather
# than .get) makes a missing variable fail fast with KeyError instead of
# silently benchmarking a wrong endpoint.
base_url, dataset_path = (
    os.environ["BASE_URL"],
    os.environ["SHAREGPT_DATASET"],
)
# Per-run input knobs that are recorded alongside each result row.
input_features = [
    "random_input_len",
    "random_output_len",
    "request_rate",
    "max_concurrency",
]
# Columns harvested from bench_serving output: mean and p99 for each latency
# statistic (TTFT, TPOT, ITL, end-to-end), plus aggregate output throughput.
output_metrics = [
    f"{stat}_{metric}"
    for metric in ("ttft_ms", "tpot_ms", "itl_ms", "e2e_latency_ms")
    for stat in ("mean", "p99")
] + ["output_throughput"]
7 | 25 |
|
| 26 | +# construct client requests |
8 | 27 | client_template = """ |
9 | | -python -m sglang.bench_serving \ |
| 28 | +python -m sglang.bench_serving |
10 | 29 | --base-url {base_url} |
11 | 30 | --backend sglang-oai |
12 | | - --tokenizer deepseek-ai/DeepSeek-R1-0528 |
13 | | - --model deepseek-ai/DeepSeek-R1-0528 |
| 31 | + --tokenizer Qwen/Qwen3-0.6B |
| 32 | + --model Qwen/Qwen3-0.6B |
14 | 33 | --dataset-path {dataset_path} |
15 | 34 | --dataset-name random |
16 | 35 | --random-range-ratio 1 |
|
20 | 39 | --max-concurrency {request_rate} |
21 | 40 | --num-prompt {num_prompt} |
22 | 41 | """ |
23 | | -rate_lists = [1, 2, 4, 8, 16, 24, 32, 40] |
| 42 | +rate_lists = [1, 2, 4, 8] |
24 | 43 | client_cmds = [ |
25 | 44 | *[ |
26 | 45 | client_template.format( |
27 | 46 | base_url=base_url, |
28 | | - input_len=2000, |
29 | | - output_len=1500, |
| 47 | + input_len=1200, |
| 48 | + output_len=800, |
30 | 49 | dataset_path=dataset_path, |
31 | 50 | request_rate=rate, |
32 | | - num_prompt=rate * 10, |
| 51 | + num_prompt=min(max(rate * 10, 80), 250), # clip to [80, 250] |
33 | 52 | ) |
34 | 53 | for rate in rate_lists |
35 | 54 | ], |
36 | 55 | *[ |
37 | 56 | client_template.format( |
38 | 57 | base_url=base_url, |
39 | | - input_len=900, |
| 58 | + input_len=800, |
40 | 59 | output_len=1200, |
41 | 60 | dataset_path=dataset_path, |
42 | 61 | request_rate=rate, |
43 | | - num_prompt=rate * 10, |
| 62 | + num_prompt=min(max(rate * 10, 80), 250), # clip to [80, 250] |
44 | 63 | ) |
45 | 64 | for rate in rate_lists |
46 | 65 | ], |
|
51 | 70 | output_len=1500, |
52 | 71 | dataset_path=dataset_path, |
53 | 72 | request_rate=rate, |
54 | | - num_prompt=rate * 10, |
| 73 | + num_prompt=min(max(rate * 10, 80), 250), # clip to [80, 250] |
55 | 74 | ) |
56 | 75 | for rate in rate_lists |
57 | 76 | ], |
58 | 77 | ] |
59 | 78 |
|
60 | | -input_features = [ |
61 | | - "random_input_len", |
62 | | - "random_output_len", |
63 | | - "request_rate", |
64 | | - "max_concurrency", |
65 | | -] |
66 | | - |
67 | | -output_metrics = [ |
68 | | - "p99_ttft_ms", |
69 | | - "p99_tpot_ms", |
70 | | - "p99_itl_ms", |
71 | | - "output_throughput", |
72 | | - "p99_e2e_latency_ms", |
73 | | - "completed", |
74 | | -] |
75 | 79 |
|
76 | 80 | if __name__ == "__main__": |
77 | 81 | client_gen( |
78 | 82 | client_cmds=client_cmds, |
79 | 83 | input_features=input_features, |
80 | 84 | output_metrics=output_metrics, |
81 | | - server_labels="deepseek_r1", |
| 85 | + server_labels="qwen3_06b", |
| 86 | + n=3, |
| 87 | + only_last=True, |
82 | 88 | output_dir="output", |
83 | 89 | ) |
0 commit comments