Commit 041aa14

feat: e2e tests for max error

1 parent: 3418f74

File tree

9 files changed: 300 additions, 7 deletions
.gitignore

Lines changed: 2 additions & 0 deletions
@@ -178,3 +178,5 @@ cython_debug/
 # Project specific files
 *.json
 *.yaml
+/bin
+uv.lock

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -53,6 +53,7 @@ dependencies = [
     "protobuf",
     "pydantic>=2.0.0",
     "pydantic-settings>=2.0.0",
+    "pytest-timeout[dev]>=2.4.0",
     "pyyaml>=6.0.0",
     "rich",
     "transformers",
@@ -78,6 +79,7 @@ dev = [
     "pytest-cov~=5.0.0",
     "pytest-mock~=3.14.0",
     "pytest-rerunfailures~=14.0",
+    "pytest-timeout~=2.4.0",
     "respx~=0.22.0",

     # code quality

src/guidellm/config.py

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ class Settings(BaseSettings):
     default_async_loop_sleep: float = 10e-5
     logging: LoggingSettings = LoggingSettings()
     default_sweep_number: int = 10
-    shutdown_poll_interval_seconds: float = 10
+    shutdown_poll_interval_seconds: float = 1
     error_check_window_size: int = 10

     # HTTP settings

tests/e2e/README.md

Lines changed: 5 additions & 0 deletions
# E2E tests

The E2E tests in GuideLLM use the [vLLM simulator by llm-d](https://llm-d.ai/docs/architecture/Components/inf-simulator). To run them, first build the simulator binary with the following command:

```shell
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./
```
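
The build above exports the `llm-d-inference-sim` binary into `./bin`, which is the path the test harness expects. A minimal sketch of running the suite afterwards, assuming the tests are invoked with plain pytest from the repository root (the exact invocation is not part of this commit):

```shell
# Build the simulator binary into ./bin (as described above), then run the e2e suite.
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./
pytest tests/e2e -v
```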

tests/e2e/test_basic.py

Lines changed: 60 additions & 0 deletions
# tests/e2e/test_basic.py

import json
import os
from pathlib import Path

import pytest
from loguru import logger

from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
    """
    Pytest fixture to start and stop the server for the entire module
    using the VllmSimServer class.
    """
    server = VllmSimServer(port=8000, model="databricks/dolly-v2-12b")
    try:
        server.start()
        yield server  # Yield the server for tests to use
    finally:
        server.stop()  # Teardown: stop the server after tests are done


@pytest.mark.timeout(30)
def test_basic_report(server: VllmSimServer):
    """
    Run a short constant-rate benchmark against the simulator and verify
    that the generated report contains the expected successful requests.
    """
    report_path = Path("tests/e2e/benchmarks.json")
    rate = 10
    command = f"""
    guidellm benchmark \
        --target "{server.get_url()}" \
        --rate-type constant \
        --rate {rate} \
        --max-seconds 1 \
        --data "prompt_tokens=256,output_tokens=128" \
        --output-path {report_path}
    """
    logger.info(f"Client command: {command}")
    os.system(command)  # noqa: S605

    assert report_path.exists()
    with report_path.open("r") as f:
        report = json.load(f)

    assert "benchmarks" in report
    benchmarks = report["benchmarks"]
    assert len(benchmarks) > 0
    benchmark = benchmarks[0]
    assert "requests" in benchmark
    requests = benchmark["requests"]
    assert "successful" in requests
    successful = requests["successful"]
    assert len(successful) == rate

    report_path.unlink()
tests/e2e/test_interrupted.py

Lines changed: 88 additions & 0 deletions
# tests/e2e/test_interrupted.py

import json
import subprocess
import time
from pathlib import Path

import pytest
from loguru import logger

from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
    """
    Pytest fixture to start and stop the server for the entire module
    using the VllmSimServer class.
    """
    server = VllmSimServer(port=8000, model="databricks/dolly-v2-12b")
    try:
        server.start()
        yield server  # Yield the server for tests to use
    finally:
        server.stop()  # Teardown: stop the server after tests are done


@pytest.mark.timeout(30)
def test_interrupted_report(server: VllmSimServer):
    """
    Start a constant-rate benchmark with a maximum error rate, stop the
    simulator mid-run, and verify that a report is still written and that
    its recorded error rate exceeds the configured threshold.
    """
    report_path = Path("tests/e2e/benchmarks.json")
    rate = 10
    max_error_rate = 0.1
    command = f"""guidellm benchmark \
        --target "{server.get_url()}" \
        --rate-type constant \
        --rate {rate} \
        --max-seconds 1000 \
        --max-error {max_error_rate} \
        --data "prompt_tokens=256,output_tokens=128" \
        --output-path {report_path}
    """
    logger.info(f"Client command: {command}")
    process = subprocess.Popen(  # noqa: S603
        ["/bin/bash", "-c", command],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    logger.info("Waiting for client to start...")
    time.sleep(5)
    server.stop()
    logger.info("Waiting for client to stop...")
    time.sleep(5)

    logger.info("Fetching client output")
    stdout, stderr = process.communicate()
    logger.info(f"Client stdout:\n{stdout}")
    logger.info(f"Client stderr:\n{stderr}")

    assert report_path.exists()
    with report_path.open("r") as f:
        report = json.load(f)

    assert "benchmarks" in report
    benchmarks = report["benchmarks"]
    assert len(benchmarks) > 0
    benchmark = benchmarks[0]
    assert "requests" in benchmark
    requests = benchmark["requests"]
    assert "successful" in requests
    successful = requests["successful"]
    assert "errored" in requests
    errored = requests["errored"]
    assert len(errored) / (len(successful) + len(errored)) > max_error_rate

    report_path.unlink()

    process.terminate()  # Send SIGTERM
    try:
        process.wait(timeout=5)  # Wait for the process to terminate
        logger.info("Client stopped successfully.")
    except subprocess.TimeoutExpired:
        logger.warning("Client did not terminate gracefully, killing it...")
        process.kill()  # Send SIGKILL if it doesn't terminate
        process.wait()

tests/e2e/test_placeholder.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

tests/e2e/vllm-sim.Dockerfile

Lines changed: 12 additions & 0 deletions
FROM golang AS base

WORKDIR /app

RUN git clone https://github.com/llm-d/llm-d-inference-sim.git && \
    cd llm-d-inference-sim && \
    make build

WORKDIR /app/llm-d-inference-sim

FROM scratch
COPY --from=base /app/llm-d-inference-sim/bin/llm-d-inference-sim /bin/llm-d-inference-sim

tests/e2e/vllm_sim_server.py

Lines changed: 130 additions & 0 deletions
import subprocess
import time

import pytest
import requests
from loguru import logger


class VllmSimServer:
    """
    A class to manage the lifecycle of a test server.
    Encapsulates starting, checking health, and stopping the server process.
    """

    def __init__(
        self,
        port: int,
        model: str,
        lora: list[str] | None = None,
        mode: str | None = None,
        echo: bool | None = None,
        random: bool | None = None,
        time_to_first_token: float | None = None,
        inter_token_latency: float | None = None,
        max_loras: int | None = None,
        max_cpu_loras: int | None = None,
        max_running_requests: int | None = None,
    ):
        self.port = port
        self.model = model
        self.lora = lora
        self.mode = mode
        self.echo = echo
        self.random = random
        self.time_to_first_token = time_to_first_token
        self.inter_token_latency = inter_token_latency
        self.max_loras = max_loras
        self.max_cpu_loras = max_cpu_loras
        self.max_running_requests = max_running_requests
        self.server_url = f"http://127.0.0.1:{self.port}"
        self.health_url = f"{self.server_url}/health"
        self.app_script = "./bin/llm-d-inference-sim"
        self.process = None

    def get_cli_parameters(self) -> list[str]:
        parameters = [
            "--port", f"{self.port}",
            "--model", self.model,
        ]
        if self.lora is not None:
            parameters.extend(["--lora", ",".join(self.lora)])
        if self.mode is not None:
            parameters.extend(["--mode", self.mode])
        if self.echo is not None:
            parameters.extend(["--echo"])
        if self.random is not None:
            parameters.extend(["--random"])
        if self.time_to_first_token is not None:
            parameters.extend(["--time-to-first-token", f"{self.time_to_first_token}"])
        if self.inter_token_latency is not None:
            parameters.extend(["--inter-token-latency", f"{self.inter_token_latency}"])
        if self.max_loras is not None:
            parameters.extend(["--max-loras", f"{self.max_loras}"])
        if self.max_cpu_loras is not None:
            parameters.extend(["--max-cpu-loras", f"{self.max_cpu_loras}"])
        if self.max_running_requests is not None:
            parameters.extend(["--max-running-requests", f"{self.max_running_requests}"])
        return parameters

    def start(self):
        """
        Starts the server process and waits for it to become healthy.
        """
        logger.info(f"Starting server on {self.server_url} using {self.app_script}...")
        cli_parameters = self.get_cli_parameters()
        command = " ".join([self.app_script, *cli_parameters])
        logger.info(f"Server command: {command}")
        self.process = subprocess.Popen(  # noqa: S603
            [self.app_script, *cli_parameters],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,  # Decode stdout/stderr as text
        )

        # Wait for the server to start and become healthy
        max_retries = 20
        retry_delay_sec = 0.5
        for i in range(max_retries):
            try:
                response = requests.get(self.health_url, timeout=1)
                if response.status_code == 200:
                    logger.info(f"Server started successfully at {self.server_url}")
                    return
                logger.warning(f"Got response with status: {response.status_code}")
                logger.warning(response.json())
            except requests.ConnectionError:
                logger.warning(f"Waiting for server... (attempt {i + 1}/{max_retries})")
            time.sleep(retry_delay_sec)
        # If the loop completes without returning, the server didn't start
        stdout, stderr = self.process.communicate()
        logger.error(f"Server failed to start after {max_retries} retries.")
        logger.error(f"Server stdout:\n{stdout}")
        logger.error(f"Server stderr:\n{stderr}")
        self.stop()  # Attempt to clean up
        pytest.fail("Server did not start within the expected time.")

    def stop(self):
        """
        Stops the server process.
        """
        if self.process:
            logger.info(f"Stopping server on {self.server_url}...")
            self.process.terminate()  # Send SIGTERM
            try:
                self.process.wait(timeout=5)  # Wait for the process to terminate
                logger.info("Server stopped successfully.")
            except subprocess.TimeoutExpired:
                logger.warning("Server did not terminate gracefully, killing it...")
                self.process.kill()  # Send SIGKILL if it doesn't terminate
                self.process.wait()
            self.process = None  # Clear the process reference

    def get_url(self):
        """
        Returns the base URL of the running server.
        """
        return self.server_url