Commit c04f1c1
chore: move benchmarking related code (#3406)
# What does this PR do?

Moves the benchmarking code from `docs/source/distributions/k8s-benchmark/` to `benchmarking/k8s-benchmark/`, with some formatting changes.

## Test Plan
1 parent d2f88a1 commit c04f1c1

File tree

10 files changed: +156 -149 lines changed

docs/source/distributions/k8s-benchmark/README.md renamed to benchmarking/k8s-benchmark/README.md

Lines changed: 1 addition & 3 deletions
@@ -34,13 +34,12 @@ This data enables data-driven architectural decisions and performance optimizati

 **1. Deploy base k8s infrastructure:**
 ```bash
-cd ../k8s
+cd ../../docs/source/distributions/k8s
 ./apply.sh
 ```

 **2. Deploy benchmark components:**
 ```bash
-cd ../k8s-benchmark
 ./apply.sh
 ```

@@ -56,7 +55,6 @@ kubectl get pods

 **Benchmark Llama Stack (default):**
 ```bash
-cd docs/source/distributions/k8s-benchmark/
 ./run-benchmark.sh
 ```

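With the rename, the benchmark assets live under `benchmarking/k8s-benchmark/` while the base Kubernetes manifests stay in `docs/source/distributions/k8s`, which is why the README now `cd`s back into the docs tree for step 1. A rough end-to-end sketch of the updated flow, assuming a checkout rooted at the repository top and that the script names shown in the diff are unchanged:

```bash
# Sketch only: the starting directory and anything beyond the paths shown in the diff are assumptions.
cd benchmarking/k8s-benchmark                            # new home of the benchmark assets

# 1. Deploy the base k8s infrastructure (manifests remain under docs/)
(cd ../../docs/source/distributions/k8s && ./apply.sh)   # subshell keeps us in k8s-benchmark

# 2. Deploy the benchmark components from the new directory
./apply.sh

# 3. Run the default Llama Stack benchmark
./run-benchmark.sh
```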
docs/source/distributions/k8s-benchmark/benchmark.py renamed to benchmarking/k8s-benchmark/benchmark.py

Lines changed: 63 additions & 66 deletions
@@ -14,7 +14,7 @@
 import random
 import statistics
 import time
-from typing import Tuple
+
 import aiohttp


@@ -55,67 +55,67 @@ def print_summary(self):

         total_time = self.end_time - self.start_time
         success_rate = (self.success_count / self.total_requests) * 100
-
-        print(f"\n{'='*60}")
-        print(f"BENCHMARK RESULTS")
-
-        print(f"\nResponse Time Statistics:")
+
+        print(f"\n{'=' * 60}")
+        print("BENCHMARK RESULTS")
+
+        print("\nResponse Time Statistics:")
         print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
         print(f"  Median: {statistics.median(self.response_times):.3f}s")
         print(f"  Min: {min(self.response_times):.3f}s")
         print(f"  Max: {max(self.response_times):.3f}s")
-
+
         if len(self.response_times) > 1:
             print(f"  Std Dev: {statistics.stdev(self.response_times):.3f}s")
-
+
         percentiles = [50, 90, 95, 99]
         sorted_times = sorted(self.response_times)
-        print(f"\nPercentiles:")
+        print("\nPercentiles:")
         for p in percentiles:
             idx = int(len(sorted_times) * p / 100) - 1
             idx = max(0, min(idx, len(sorted_times) - 1))
             print(f"  P{p}: {sorted_times[idx]:.3f}s")
-
+
         if self.ttft_times:
-            print(f"\nTime to First Token (TTFT) Statistics:")
+            print("\nTime to First Token (TTFT) Statistics:")
             print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
             print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
             print(f"  Min: {min(self.ttft_times):.3f}s")
             print(f"  Max: {max(self.ttft_times):.3f}s")
-
+
             if len(self.ttft_times) > 1:
                 print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
-
+
             sorted_ttft = sorted(self.ttft_times)
-            print(f"\nTTFT Percentiles:")
+            print("\nTTFT Percentiles:")
             for p in percentiles:
                 idx = int(len(sorted_ttft) * p / 100) - 1
                 idx = max(0, min(idx, len(sorted_ttft) - 1))
                 print(f"  P{p}: {sorted_ttft[idx]:.3f}s")
-
+
         if self.chunks_received:
-            print(f"\nStreaming Statistics:")
+            print("\nStreaming Statistics:")
             print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
             print(f"  Total chunks received: {sum(self.chunks_received)}")
-
-        print(f"{'='*60}")
+
+        print(f"{'=' * 60}")
         print(f"Total time: {total_time:.2f}s")
         print(f"Concurrent users: {self.concurrent_users}")
         print(f"Total requests: {self.total_requests}")
         print(f"Successful requests: {self.success_count}")
         print(f"Failed requests: {len(self.errors)}")
         print(f"Success rate: {success_rate:.1f}%")
         print(f"Requests per second: {self.success_count / total_time:.2f}")
-
+
         if self.errors:
-            print(f"\nErrors (showing first 5):")
+            print("\nErrors (showing first 5):")
             for error in self.errors[:5]:
                 print(f"  {error}")


 class LlamaStackBenchmark:
     def __init__(self, base_url: str, model_id: str):
-        self.base_url = base_url.rstrip('/')
+        self.base_url = base_url.rstrip("/")
         self.model_id = model_id
         self.headers = {"Content-Type": "application/json"}
         self.test_messages = [
@@ -126,74 +126,67 @@ def __init__(self, base_url: str, model_id: str):
             [
                 {"role": "user", "content": "What is machine learning?"},
                 {"role": "assistant", "content": "Machine learning is a subset of AI..."},
-                {"role": "user", "content": "Can you give me a practical example?"}
-            ]
+                {"role": "user", "content": "Can you give me a practical example?"},
+            ],
         ]

-
-    async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
+    async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
         """Make a single async streaming chat completion request."""
         messages = random.choice(self.test_messages)
-        payload = {
-            "model": self.model_id,
-            "messages": messages,
-            "stream": True,
-            "max_tokens": 100
-        }
-
+        payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}
+
         start_time = time.time()
         chunks_received = 0
         ttft = None
         error = None
-
+
         session = aiohttp.ClientSession()
-
+
         try:
             async with session.post(
                 f"{self.base_url}/chat/completions",
                 headers=self.headers,
                 json=payload,
-                timeout=aiohttp.ClientTimeout(total=30)
+                timeout=aiohttp.ClientTimeout(total=30),
             ) as response:
                 if response.status == 200:
                     async for line in response.content:
                         if line:
-                            line_str = line.decode('utf-8').strip()
-                            if line_str.startswith('data: '):
+                            line_str = line.decode("utf-8").strip()
+                            if line_str.startswith("data: "):
                                 chunks_received += 1
                                 if ttft is None:
                                     ttft = time.time() - start_time
-                                if line_str == 'data: [DONE]':
+                                if line_str == "data: [DONE]":
                                     break
-
+
                     if chunks_received == 0:
                         error = "No streaming chunks received"
                 else:
                     text = await response.text()
                     error = f"HTTP {response.status}: {text[:100]}"
-
+
         except Exception as e:
             error = f"Request error: {str(e)}"
         finally:
             await session.close()
-
+
         response_time = time.time() - start_time
         return response_time, chunks_received, ttft, error

-
     async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
         """Run benchmark using async requests for specified duration."""
         stats = BenchmarkStats()
         stats.concurrent_users = concurrent_users
         stats.start_time = time.time()
-
+
         print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
         print(f"Target URL: {self.base_url}/chat/completions")
         print(f"Model: {self.model_id}")
-
+
         connector = aiohttp.TCPConnector(limit=concurrent_users)
-        async with aiohttp.ClientSession(connector=connector) as session:
-
+        async with aiohttp.ClientSession(connector=connector):
+
             async def worker(worker_id: int):
                 """Worker that sends requests sequentially until canceled."""
                 request_count = 0
@@ -202,12 +195,12 @@ async def worker(worker_id: int):
                         response_time, chunks, ttft, error = await self.make_async_streaming_request()
                         await stats.add_result(response_time, chunks, ttft, error)
                         request_count += 1
-
+
                     except asyncio.CancelledError:
                         break
                     except Exception as e:
                         await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")
-
+
             # Progress reporting task
             async def progress_reporter():
                 last_report_time = time.time()
@@ -216,48 +209,52 @@ async def progress_reporter():
                         await asyncio.sleep(1)  # Report every second
                         if time.time() >= last_report_time + 10:  # Report every 10 seconds
                             elapsed = time.time() - stats.start_time
-                            print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}")
+                            print(
+                                f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}"
+                            )
                             last_report_time = time.time()
                     except asyncio.CancelledError:
                         break
-
+
             # Spawn concurrent workers
             tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
             progress_task = asyncio.create_task(progress_reporter())
             tasks.append(progress_task)
-
+
             # Wait for duration then cancel all tasks
             await asyncio.sleep(duration)
-
+
             for task in tasks:
                 task.cancel()
-
+
             # Wait for all tasks to complete
             await asyncio.gather(*tasks, return_exceptions=True)
-
+
         stats.end_time = time.time()
         return stats


 def main():
     parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
-    parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
-                        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
-    parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
-                        help="Model ID to use for requests")
-    parser.add_argument("--duration", type=int, default=60,
-                        help="Duration in seconds to run benchmark (default: 60)")
-    parser.add_argument("--concurrent", type=int, default=10,
-                        help="Number of concurrent users (default: 10)")
-
+    parser.add_argument(
+        "--base-url",
+        default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
+        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
+    )
+    parser.add_argument(
+        "--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
+    )
+    parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
+    parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")
+
     args = parser.parse_args()
-
+
     benchmark = LlamaStackBenchmark(args.base_url, args.model)
-
+
     try:
         stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
         stats.print_summary()
-
+
     except KeyboardInterrupt:
         print("\nBenchmark interrupted by user")
     except Exception as e:
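The reflowed argparse block above doubles as the CLI reference for the tool: `--base-url`, `--model`, `--duration`, and `--concurrent`, with `BENCHMARK_BASE_URL` and `INFERENCE_MODEL` read from the environment as defaults. A hedged example invocation from the renamed directory; calling the script directly with `python` (rather than through `run-benchmark.sh`) and exporting the defaults via the environment are assumptions, not something this commit prescribes:

```bash
# Example only: flag names and default values are taken from the argparse setup above.
cd benchmarking/k8s-benchmark
export BENCHMARK_BASE_URL="http://localhost:8000/v1/openai/v1"   # default for --base-url
export INFERENCE_MODEL="test-model"                              # default for --model
python benchmark.py --duration 60 --concurrent 10                # 60s run, 10 concurrent users
```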
