11import argparse
2+ import code
23import json
4+ import signal
5+ import subprocess
6+ import traceback
37from typing import AsyncGenerator
48
59import uvicorn
@@ -46,9 +50,9 @@ async def stream_results() -> AsyncGenerator[str, None]:
4650 "text" : request_output .outputs [- 1 ].text [len (last_output_text ) :],
4751 "count_prompt_tokens" : len (request_output .prompt_token_ids ),
4852 "count_output_tokens" : len (request_output .outputs [0 ].token_ids ),
49- "log_probs" : request_output . outputs [ 0 ]. logprobs [ - 1 ]
50- if sampling_params .logprobs
51- else None ,
53+ "log_probs" : (
54+ request_output . outputs [ 0 ]. logprobs [ - 1 ] if sampling_params .logprobs else None
55+ ) ,
5256 "finished" : request_output .finished ,
5357 }
5458 last_output_text = request_output .outputs [- 1 ].text
@@ -88,7 +92,47 @@ async def abort_request() -> None:
8892 return Response (content = json .dumps (ret ))
8993
9094
def get_gpu_free_memory():
    """Get GPU free memory using nvidia-smi.

    Runs ``nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits``
    and parses one integer per non-blank output line.

    Returns:
        A list of integers, one per reported GPU, or ``None`` when the query
        could not be run (command missing, not executable, or non-zero exit
        status) or when its output could not be parsed as integers.
    """
    try:
        output = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits"]
        ).decode("utf-8")
    except (subprocess.CalledProcessError, OSError):
        # OSError (e.g. FileNotFoundError) is raised when nvidia-smi is not
        # installed or not executable; treat that the same as a failed query
        # so a best-effort startup check cannot crash the process.
        return None

    try:
        # splitlines() handles any newline convention; int() tolerates the
        # surrounding whitespace left on each value.
        return [int(line) for line in output.splitlines() if line.strip()]
    except ValueError:
        # Non-numeric output (for instance an "[N/A]" field) is unusable here.
        return None
105+
106+
def check_unknown_startup_memory_usage():
    """Check for unknown memory usage at startup.

    Reads the per-GPU free memory via ``get_gpu_free_memory()``. When the
    difference between the largest and smallest reading exceeds 10, prints a
    warning followed by the output of ``fuser -v /dev/nvidia*`` so the
    processes holding the GPU devices can be identified.

    This is a best-effort diagnostic: it returns silently when no readings
    are available, and reports (instead of raising) when ``fuser`` cannot be
    run or exits with a non-zero status.
    """
    gpu_free_memory = get_gpu_free_memory()
    # None means the query failed; an empty list means no GPU was reported.
    # Either way there is nothing to compare.
    if not gpu_free_memory:
        return

    if max(gpu_free_memory) - min(gpu_free_memory) <= 10:
        return

    # NOTE: the values printed here are the *free* memory readings per GPU.
    print(
        f"WARNING: Unbalanced GPU memory usage at start up. This may cause OOM. Memory usage per GPU in MB: {gpu_free_memory} ."
    )
    try:
        # The shell is required to expand the /dev/nvidia* glob. fuser writes
        # its verbose process table to stderr, so fold stderr into the
        # captured output to make it part of the printed report.
        output = subprocess.check_output(
            "fuser -v /dev/nvidia*", shell=True, stderr=subprocess.STDOUT
        ).decode("utf-8", errors="replace")
    except (subprocess.CalledProcessError, OSError) as err:
        # fuser exits non-zero when it is missing or finds nothing to report;
        # that must not abort startup.
        print(f"Could not list processes using GPU: {err}")
        return
    print(f"Processes using GPU: {output} ")
119+
120+
def debug(sig, frame):
    """Signal handler that opens an interactive Python console.

    The console namespace is built from the interrupted frame: its globals,
    overlaid by its locals, plus ``_frame`` bound to the frame object itself
    (unless a global or local of the same name shadows it). The banner shown
    on entry contains the stack trace of that frame.

    Args:
        sig: The signal number delivered to the handler (unused).
        frame: The stack frame that was executing when the signal arrived.
    """
    # Later entries win, so locals shadow globals, which shadow "_frame".
    namespace = {"_frame": frame, **frame.f_globals, **frame.f_locals}

    banner = "Signal received : entering python shell.\n Traceback:\n " + "".join(
        traceback.format_stack(frame)
    )
    code.InteractiveConsole(namespace).interact(banner)
132+
133+
91134if __name__ == "__main__" :
135+ check_unknown_startup_memory_usage ()
92136 parser = argparse .ArgumentParser ()
93137 parser .add_argument ("--host" , type = str , default = None ) # None == IPv4 / IPv6 dualstack
94138 parser .add_argument ("--port" , type = int , default = 5005 )
@@ -98,6 +142,8 @@ async def abort_request() -> None:
98142 engine_args = AsyncEngineArgs .from_cli_args (args )
99143 engine = AsyncLLMEngine .from_engine_args (engine_args )
100144
145+ signal .signal (signal .SIGUSR1 , debug )
146+
101147 uvicorn .run (
102148 app ,
103149 host = args .host ,
0 commit comments