import argparse
import asyncio
import os
import random
import statistics
import time

import aiohttp
@@ -55,67 +55,67 @@ def print_summary(self):

        total_time = self.end_time - self.start_time
        success_rate = (self.success_count / self.total_requests) * 100

        print(f"\n{'=' * 60}")
        print("BENCHMARK RESULTS")

        print("\nResponse Time Statistics:")
        print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
        print(f"  Median: {statistics.median(self.response_times):.3f}s")
        print(f"  Min: {min(self.response_times):.3f}s")
        print(f"  Max: {max(self.response_times):.3f}s")

        if len(self.response_times) > 1:
            print(f"  Std Dev: {statistics.stdev(self.response_times):.3f}s")

        percentiles = [50, 90, 95, 99]
        sorted_times = sorted(self.response_times)
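        # Nearest-rank percentile: with 10 samples, P90 reads index
        # int(10 * 90 / 100) - 1 = 8 (the 9th smallest value); the clamp below
        # keeps the index in range for very small sample counts.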
        print("\nPercentiles:")
        for p in percentiles:
            idx = int(len(sorted_times) * p / 100) - 1
            idx = max(0, min(idx, len(sorted_times) - 1))
            print(f"  P{p}: {sorted_times[idx]:.3f}s")

        if self.ttft_times:
            print("\nTime to First Token (TTFT) Statistics:")
            print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
            print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
            print(f"  Min: {min(self.ttft_times):.3f}s")
            print(f"  Max: {max(self.ttft_times):.3f}s")

            if len(self.ttft_times) > 1:
                print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")

            sorted_ttft = sorted(self.ttft_times)
            print("\nTTFT Percentiles:")
            for p in percentiles:
                idx = int(len(sorted_ttft) * p / 100) - 1
                idx = max(0, min(idx, len(sorted_ttft) - 1))
                print(f"  P{p}: {sorted_ttft[idx]:.3f}s")

        if self.chunks_received:
            print("\nStreaming Statistics:")
            print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
            print(f"  Total chunks received: {sum(self.chunks_received)}")

        print(f"{'=' * 60}")
        print(f"Total time: {total_time:.2f}s")
        print(f"Concurrent users: {self.concurrent_users}")
        print(f"Total requests: {self.total_requests}")
        print(f"Successful requests: {self.success_count}")
        print(f"Failed requests: {len(self.errors)}")
        print(f"Success rate: {success_rate:.1f}%")
        print(f"Requests per second: {self.success_count / total_time:.2f}")

        if self.errors:
            print("\nErrors (showing first 5):")
            for error in self.errors[:5]:
                print(f"  {error}")


class LlamaStackBenchmark:
    def __init__(self, base_url: str, model_id: str):
        self.base_url = base_url.rstrip("/")
        self.model_id = model_id
        self.headers = {"Content-Type": "application/json"}
        self.test_messages = [
@@ -126,74 +126,67 @@ def __init__(self, base_url: str, model_id: str):
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"},
            ],
        ]
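        # The prompt set mixes short single-turn prompts (elided above) with a
        # multi-turn conversation, so generated responses vary in length.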

    async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
        """Make a single async streaming chat completion request."""
        messages = random.choice(self.test_messages)
        payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}
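        # The response is expected to stream Server-Sent Events: one JSON chunk
        # per "data:" line, terminated by "data: [DONE]" (OpenAI-compatible
        # streaming format; the chunk shape below is illustrative):
        #   data: {"choices": [{"delta": {"content": "Hi"}}]}
        #   data: [DONE]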

        start_time = time.time()
        chunks_received = 0
        ttft = None
        error = None

        # A fresh session per request; closed in the `finally` block below.
        session = aiohttp.ClientSession()

        try:
            async with session.post(
                # Assumes the OpenAI-compatible chat completions route under base_url.
                f"{self.base_url}/chat/completions",
                headers=self.headers,
                json=payload,
                timeout=aiohttp.ClientTimeout(total=30),
            ) as response:
                if response.status == 200:
                    async for line in response.content:
                        if line:
                            line_str = line.decode("utf-8").strip()
                            if line_str.startswith("data: "):
                                chunks_received += 1
                                if ttft is None:
                                    # First streamed chunk marks time to first token.
                                    ttft = time.time() - start_time
                                if line_str == "data: [DONE]":
                                    break

                    if chunks_received == 0:
                        error = "No streaming chunks received"
                else:
                    text = await response.text()
                    error = f"HTTP {response.status}: {text[:100]}"

        except Exception as e:
            error = f"Request error: {str(e)}"
        finally:
            await session.close()

        response_time = time.time() - start_time
        return response_time, chunks_received, ttft, error

    async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
        """Run benchmark using async requests for specified duration."""
        stats = BenchmarkStats()
        stats.concurrent_users = concurrent_users
        stats.start_time = time.time()

        print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
        print(f"Target URL: {self.base_url}")
        print(f"Model: {self.model_id}")

        # Note: each request builds its own session, so overall concurrency is
        # bounded by the number of worker tasks rather than by this connector.
        connector = aiohttp.TCPConnector(limit=concurrent_users)
        async with aiohttp.ClientSession(connector=connector):

            async def worker(worker_id: int):
                """Worker that sends requests sequentially until canceled."""
                request_count = 0
@@ -202,12 +195,12 @@ async def worker(worker_id: int):
                        response_time, chunks, ttft, error = await self.make_async_streaming_request()
                        await stats.add_result(response_time, chunks, ttft, error)
                        request_count += 1

                    except asyncio.CancelledError:
                        break
                    except Exception as e:
                        await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")

            # Progress reporting task
            async def progress_reporter():
                last_report_time = time.time()
@@ -216,48 +209,52 @@ async def progress_reporter():
                        await asyncio.sleep(1)  # Wake once per second...
                        if time.time() >= last_report_time + 10:  # ...but report only every 10 seconds
                            elapsed = time.time() - stats.start_time
                            print(
                                f"Completed: {stats.total_requests} requests in {elapsed:.1f}s ({stats.total_requests / elapsed:.1f} req/s)"
                            )
                            last_report_time = time.time()
                    except asyncio.CancelledError:
                        break

            # Spawn concurrent workers
            tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
            progress_task = asyncio.create_task(progress_reporter())
            tasks.append(progress_task)

            # Wait for the duration, then cancel all tasks
            await asyncio.sleep(duration)

            for task in tasks:
                task.cancel()

            # Wait for all tasks to complete
            await asyncio.gather(*tasks, return_exceptions=True)

        stats.end_time = time.time()
        return stats


def main():
    parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
    parser.add_argument(
        "--base-url",
        default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
    )
    parser.add_argument(
        "--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
    )
    parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
    parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")

    args = parser.parse_args()

    benchmark = LlamaStackBenchmark(args.base_url, args.model)

    try:
        stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
        stats.print_summary()

    except KeyboardInterrupt:
        print("\nBenchmark interrupted by user")
    except Exception as e:
        # (assumed handler body; the original is truncated in the diff)
        print(f"Benchmark failed: {e}")
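

# Assumed entry point; the source is truncated here. Example invocation:
#   python benchmark.py --base-url http://localhost:8000/v1/openai/v1 \
#       --model test-model --duration 30 --concurrent 5
if __name__ == "__main__":
    main()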