@@ -207,7 +207,6 @@ def __init__(self,
         self.add_api_route("/v1/embeddings", self._embeddings_style_openai, methods=["POST"])
         self.add_api_route("/v1/chat/completions", self._chat_completions, methods=["POST"])

-        self.add_api_route("/v1/models", self._models, methods=["GET"])
         self.add_api_route("/tokenizer/{model_name}", self._tokenizer, methods=["GET"])

         self._inference_queue = inference_queue
@@ -483,35 +482,6 @@ async def _embeddings_style_openai(self, post: EmbeddingsStyleOpenAI, authorizat
             "usage": {"prompt_tokens": -1, "total_tokens": -1}
         }

-    async def _models(self, authorization: str = Header(None)):
-        await self._account_from_bearer(authorization)
-        try:
-            async with aiohttp.ClientSession() as session:
-                async with session.get("http://127.0.0.1:8001/v1/caps") as resp:
-                    lsp_server_caps = await resp.json()
-        except aiohttp.ClientConnectorError as e:
-            err_msg = f"LSP server is not ready yet: {e}"
-            log(err_msg)
-            raise HTTPException(status_code=401, detail=err_msg)
-        completion_models = set()
-        for model, caps in lsp_server_caps["code_completion_models"].items():
-            completion_models.update({model, *caps["similar_models"]})
-        chat_models = set()
-        for model, caps in lsp_server_caps["code_chat_models"].items():
-            chat_models.update({model, *caps["similar_models"]})
-        data = [
-            {
-                "id": model, "root": model, "object": "model",
-                "created": 0, "owned_by": "", "permission": [], "parent": None,
-                "completion": model in completion_models, "chat": model in chat_models,
-            }
-            for model in lsp_server_caps["running_models"]
-        ]
-        return {
-            "object": "list",
-            "data": data,
-        }
-
     async def _chat_completions(self, post: ChatContext, authorization: str = Header(None)):
         def compose_usage_dict(model_dict, prompt_tokens_n, generated_tokens_n) -> Dict[str, Any]:
             usage_dict = dict()
@@ -543,6 +513,7 @@ def _wrap_output(output: str) -> str:
             return prefix + output + postfix

         model_dict = self._model_assigner.models_db_with_passthrough.get(post.model, {})
+        assert model_dict.get('backend') == 'litellm'

         async def litellm_streamer():
             generated_tokens_n = 0
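For context on the new assert: with the refact-lsp fallback removed further down, every chat model reaching this handler must come from a litellm-backed entry. A hypothetical models_db_with_passthrough entry that would satisfy the assert might look like the sketch below; 'backend' and 'resolve_as' are the keys referenced in this diff, the model names themselves are illustrative.

    # Hypothetical entry shape, not taken from the repository:
    models_db_with_passthrough = {
        "gpt-4o": {
            "backend": "litellm",    # required now that chat_completion_streamer() is gone
            "resolve_as": "gpt-4o",  # optional alias used for token counting and routing
        },
    }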
@@ -613,47 +584,14 @@ async def litellm_non_streamer():
                 log(err_msg)
                 yield json.dumps(_patch_caps_version({"error": err_msg}))

-        async def chat_completion_streamer():
-            post_url = "http://127.0.0.1:8001/v1/chat"
-            payload = {
-                "messages": messages,
-                "stream": True,
-                "model": post.model,
-                "parameters": {
-                    "temperature": post.temperature,
-                    "max_new_tokens": post.actual_max_tokens,
-                }
-            }
-            async with aiohttp.ClientSession() as session:
-                try:
-                    async with session.post(post_url, json=payload) as response:
-                        finish_reason = None
-                        async for data, _ in response.content.iter_chunks():
-                            try:
-                                data = data.decode("utf-8")
-                                data = json.loads(data[len(prefix):-len(postfix)])
-                                finish_reason = data["choices"][0]["finish_reason"]
-                                data["choices"][0]["finish_reason"] = None
-                            except json.JSONDecodeError:
-                                data = {"choices": [{"finish_reason": finish_reason}]}
-                            yield _wrap_output(json.dumps(_patch_caps_version(data)))
-                except aiohttp.ClientConnectorError as e:
-                    err_msg = f"LSP server is not ready yet: {e}"
-                    log(err_msg)
-                    yield _wrap_output(json.dumps(_patch_caps_version({"error": err_msg})))
-
-        if model_dict.get('backend') == 'litellm':
-            model_name = model_dict.get('resolve_as', post.model)
-            if model_name not in litellm.model_list:
-                log(f"warning: requested model {model_name} is not in the litellm.model_list (this might not be the issue for some providers)")
-            log(f"chat/completions: model resolve {post.model} -> {model_name}")
-            prompt_tokens_n = litellm.token_counter(model_name, messages=messages)
-            if post.tools:
-                prompt_tokens_n += litellm.token_counter(model_name, text=json.dumps(post.tools))
-            response_streamer = litellm_streamer() if post.stream else litellm_non_streamer()
-        else:
-            # TODO: unused refact-lsp logic, remove ASAP
-            response_streamer = chat_completion_streamer()
+        model_name = model_dict.get('resolve_as', post.model)
+        if model_name not in litellm.model_list:
+            log(f"warning: requested model {model_name} is not in the litellm.model_list (this might not be the issue for some providers)")
+        log(f"chat/completions: model resolve {post.model} -> {model_name}")
+        prompt_tokens_n = litellm.token_counter(model_name, messages=messages)
+        if post.tools:
+            prompt_tokens_n += litellm.token_counter(model_name, text=json.dumps(post.tools))
+        response_streamer = litellm_streamer() if post.stream else litellm_non_streamer()

         return StreamingResponse(response_streamer, media_type="text/event-stream")

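For reference, a minimal self-contained sketch of the prompt-token accounting the simplified path relies on; the model name, messages, and tool definition are illustrative, not taken from the diff.

    import json
    import litellm

    messages = [{"role": "user", "content": "Hello"}]
    tools = [{"type": "function", "function": {"name": "lookup_symbol", "parameters": {"type": "object", "properties": {}}}}]

    # token_counter(model, messages=...) counts chat-formatted prompt tokens;
    # tool definitions are not part of that count, so the handler adds them
    # separately, serialized to JSON, exactly as in the code above.
    prompt_tokens_n = litellm.token_counter("gpt-4o", messages=messages)
    prompt_tokens_n += litellm.token_counter("gpt-4o", text=json.dumps(tools))
    print(prompt_tokens_n)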