@@ -144,68 +144,75 @@ async def _async_launch_request(self, messages: List[Dict[str, str]], max_token
144144 start_time = time .time ()
145145 first_token_time = None
146146
147- # Make the request
148- response = await self .client .chat .completions .create (
149- model = model ,
150- messages = messages ,
151- stream = True ,
152- max_tokens = max_tokens ,
153- temperature = 0.0 ,
154- stream_options = {"include_usage" : True },
155- extra_headers = extra_headers ,
156- )
147+ try :
148+ # Make the request
149+ response = await self .client .chat .completions .create (
150+ model = model ,
151+ messages = messages ,
152+ stream = True ,
153+ max_tokens = max_tokens ,
154+ temperature = 0.0 ,
155+ stream_options = {"include_usage" : True },
156+ extra_headers = extra_headers ,
157+ )
158+
159+ # Process the streaming response
160+ async for chunk in response :
161+ if not chunk .choices :
162+ continue
157163
158- # Process the streaming response
159- async for chunk in response :
160- if not chunk .choices :
161- continue
162-
163- # Handle content
164- if chunk .choices [0 ].delta .content is not None :
165- if first_token_time is None and chunk .choices [0 ].delta .content != "" :
166- first_token_time = time .time ()
167- words += chunk .choices [0 ].delta .content
168-
169- # Handle token counts if available
170- if hasattr (chunk , 'usage' ) and chunk .usage is not None :
171- tokens_out = chunk .usage .completion_tokens
172- tokens_prefill = chunk .usage .prompt_tokens
173-
174- # If we didn't get token counts from streaming, try to get them from the final response
175- if tokens_out == 0 or tokens_prefill == 0 :
176- print ("No token counts from streaming, getting final response" )
177- print (f"{ tokens_out } , { tokens_prefill } " )
178- try :
179- final_response = await self .client .chat .completions .create (
180- model = model ,
181- messages = messages ,
182- stream = False ,
183- )
184- if hasattr (final_response , 'usage' ) and final_response .usage is not None :
185- tokens_out = final_response .usage .completion_tokens
186- tokens_prefill = final_response .usage .prompt_tokens
187- except Exception as e :
188- logging .warning (f"Failed to get token counts from final response: { e } " )
189-
190- # # Calculate timing metrics
191- ttft = first_token_time - start_time if first_token_time else 0
192- generation_time = time .time () - first_token_time if first_token_time else 0
193-
194- return Response (
195- body = words ,
196- ttft = ttft ,
197- generation_time = generation_time ,
198- prompt_tokens = tokens_prefill ,
199- generation_tokens = tokens_out ,
200- launch_time = start_time ,
201- finish_time = time .time (),
202- agentID = agentID ,
203- )
164+ # Handle content
165+ if chunk .choices [0 ].delta .content is not None :
166+ if first_token_time is None and chunk .choices [0 ].delta .content != "" :
167+ first_token_time = time .time ()
168+ words += chunk .choices [0 ].delta .content
169+
170+ # Handle token counts if available
171+ if hasattr (chunk , 'usage' ) and chunk .usage is not None :
172+ tokens_out = chunk .usage .completion_tokens
173+ tokens_prefill = chunk .usage .prompt_tokens
174+
175+ # If we didn't get token counts from streaming, try to get them from the final response
176+ if tokens_out == 0 or tokens_prefill == 0 :
177+ print ("No token counts from streaming, getting final response" )
178+ print (f"{ tokens_out } , { tokens_prefill } " )
179+ try :
180+ final_response = await self .client .chat .completions .create (
181+ model = model ,
182+ messages = messages ,
183+ stream = False ,
184+ )
185+ if hasattr (final_response , 'usage' ) and final_response .usage is not None :
186+ tokens_out = final_response .usage .completion_tokens
187+ tokens_prefill = final_response .usage .prompt_tokens
188+ except Exception as e :
189+ logging .warning (f"Failed to get token counts from final response: { e } " )
190+
191+ # # Calculate timing metrics
192+ ttft = first_token_time - start_time if first_token_time else 0
193+ generation_time = time .time () - first_token_time if first_token_time else 0
194+
195+ return Response (
196+ body = words ,
197+ ttft = ttft ,
198+ generation_time = generation_time ,
199+ prompt_tokens = tokens_prefill ,
200+ generation_tokens = tokens_out ,
201+ launch_time = start_time ,
202+ finish_time = time .time (),
203+ agentID = agentID ,
204+ )
205+ except openai .BadRequestError as e :
206+ logging .warning (f"BadRequestError with model { model } : { e } " )
207+ return None
208+ except Exception as e :
209+ logging .error (f"Error during request to model { model } : { e } " )
210+ return None
204211
205212 except Exception as e :
206213 logging .error (f"Error in _async_launch_request: { str (e )} " )
207214 logging .error (f"Request details - model: { model } , messages: { messages } " )
208- raise
215+ return None
209216
210217 def launch_request (
211218 self ,
@@ -218,11 +225,18 @@ def launch_request(
218225 """
219226 finish_callback: Callable[[Response, int], None]
220227 """
221- real_callback = lambda x : finish_callback (x .result (), agentID )
228+ def safe_callback (future ):
229+ try :
230+ result = future .result ()
231+ # The callback will handle the None case
232+ finish_callback (result , agentID )
233+ except Exception as e :
234+ logger .error (f"Error in callback: { e } " )
235+
222236 future = asyncio .run_coroutine_threadsafe (
223237 self ._async_launch_request (messages , max_tokens , agentID , extra_headers ), self .loop
224238 )
225- future .add_done_callback (real_callback )
239+ future .add_done_callback (safe_callback )
226240
227241
228242class UserSession :
@@ -309,7 +323,13 @@ def _launch_new_request(self, timestamp: float, request_executor: RequestExecuto
309323 self .has_unfinished_request = True
310324 self .last_request_time = timestamp
311325
312- def _on_request_finished (self , response : Response , agentID : int ):
326+ def _on_request_finished (self , response : Optional [Response ], agentID : int ):
327+ if response is None :
328+ logger .warning (f"User { self .user_config .user_id } request failed (likely context length exceeded)" )
329+ self .has_unfinished_request = False
330+ self .finished = True # Mark session as finished when request fails
331+ return
332+
313333 if self .user_config .whole_history :
314334 self .chat_history .on_system_response_whole (response .body , agentID )
315335 else :
@@ -666,7 +686,7 @@ def main():
666686 f"When --trace-file is omitted, you MUST supply: { ', ' .join (missing )} "
667687 )
668688
669- # From here on you know you’ re in exactly one mode:
689+ # From here on you know you' re in exactly one mode:
670690 if args .trace_file :
671691 print ("Running in trace‑mode, loading:" , args .trace_file )
672692 else :
0 commit comments