import subprocess
import time
+from typing import Optional

import pytest
import requests
@@ -16,15 +17,15 @@ def __init__(
        self,
        port: int,
        model: str,
-        lora: list[str] | None = None,
-        mode: str | None = None,
-        echo: bool | None = None,
-        random: bool | None = None,
-        time_to_first_token: float | None = None,
-        inter_token_latency: float | None = None,
-        max_loras: int | None = None,
-        max_cpu_loras: int | None = None,
-        max_running_requests: int | None = None,
+        lora: Optional[list[str]] = None,
+        mode: Optional[str] = None,
+        echo: Optional[bool] = None,
+        random: Optional[bool] = None,
+        time_to_first_token: Optional[float] = None,
+        inter_token_latency: Optional[float] = None,
+        max_loras: Optional[int] = None,
+        max_cpu_loras: Optional[int] = None,
+        max_running_requests: Optional[int] = None,
    ):
        self.port = port
        self.model = model
@@ -40,13 +41,10 @@ def __init__(
        self.server_url = f"http://127.0.0.1:{self.port}"
        self.health_url = f"{self.server_url}/health"
        self.app_script = "./bin/llm-d-inference-sim"
-        self.process = None
+        self.process: Optional[subprocess.Popen] = None

    def get_cli_parameters(self) -> list[str]:
-        parameters = [
-            "--port", f"{self.port}",
-            "--model", self.model
-        ]
+        parameters = ["--port", f"{self.port}", "--model", self.model]
        if self.lora is not None:
            parameters.extend(["--lora", ",".join(self.lora)])
        if self.mode is not None:
@@ -64,19 +62,20 @@ def get_cli_parameters(self) -> list[str]:
        if self.max_cpu_loras is not None:
            parameters.extend(["--max-cpu-loras", f"{self.max_cpu_loras}"])
        if self.max_running_requests is not None:
-            parameters.extend(["--max-running-requests", f"{self.max_running_requests}"])
+            parameters.extend(
+                ["--max-running-requests", f"{self.max_running_requests}"]
+            )
        return parameters

    def start(self):
        """
        Starts the server process and waits for it to become healthy.
        """

-        logger.info(f"Starting server on {self.server_url}"
-                    f" using {self.app_script}...")
+        logger.info(f"Starting server on {self.server_url} using {self.app_script}...")
        cli_parameters = self.get_cli_parameters()
        command = " ".join([self.app_script, *cli_parameters])
-        logger.info(f"Server command: {command}")  # ./bin/llm-d-inference-sim --model databricks/dolly-v2-12b --port 8000
+        logger.info(f"Server command: {command}")
        self.process = subprocess.Popen(  # noqa: S603
            [self.app_script, *cli_parameters],
            stdout=subprocess.PIPE,
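For context, a minimal sketch of the health wait that start()'s docstring promises, reusing the health_url attribute and the requests and time imports already in this file. The _wait_until_healthy name, timeout default, and poll interval are assumptions for illustration and are not part of this diff:

def _wait_until_healthy(self, timeout: float = 30.0) -> None:
    # Poll the simulator's health endpoint until it answers, or give up.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(self.health_url, timeout=1).status_code == 200:
                return  # server is up and responding to health checks
        except requests.ConnectionError:
            pass  # server is not accepting connections yet
        time.sleep(0.1)
    raise RuntimeError(f"Server at {self.server_url} did not become healthy")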