1010 AutoTokenizer ,
1111 BitsAndBytesConfig ,
1212)
13+ import litellm
14+ import logging
15+ from litellm import completion
16+
# Silence litellm's console noise at import time.
# NOTE(review): _disable_debugging is a private litellm API — may break on
# litellm upgrades; verify against the pinned litellm version.
litellm._logging._disable_debugging()
# Loggers that litellm (and its HTTP client, httpx) emit chatter through.
loggers = [
    "LiteLLM Proxy",
    "LiteLLM Router",
    "LiteLLM",
    "httpx"
]
for logger_name in loggers:
    logger = logging.getLogger(logger_name)
    # CRITICAL + 1 is above every standard level, so these loggers emit nothing.
    logger.setLevel(logging.CRITICAL + 1)
27+
1328
1429try :
1530 from vllm import LLM , SamplingParams
@@ -401,3 +416,96 @@ def generate_unconstrained(
401416 tokens_used += response .usage .total_tokens
402417 contents .append (content )
403418 return contents , tokens_used
419+
420+
class LiteLLMGenerator:
    """Text generator backed by litellm's ``completion`` API.

    Mirrors the interface of the other generators in this module:
    ``prepare_messages`` builds a chat-message window from a ``History``,
    and ``generate_unconstrained`` performs free-form generation.
    """

    def __init__(self, model_name, use_images=False):
        # Whether {"type": "image"} content entries are converted to
        # base64 image_url payloads in prepare_messages.
        self.use_images = use_images
        self.model_name = model_name
        # "hosted_vllm" models are served by a local vLLM OpenAI-compatible
        # endpoint; everything else goes through litellm's default routing.
        self.api_base = (
            "http://0.0.0.0:8000/v1" if "hosted_vllm" in self.model_name else None
        )

    def reset(self):
        """No per-episode state to clear; kept for interface parity."""
        pass

    def prepare_messages(
        self,
        history: "History",
        max_messages_window: int,
    ) -> tuple[list[dict], list]:
        """Build the chat-message window for the model, optionally inlining images.

        Args:
            history: dialogue history; ``history.images`` holds the raw frames
                referenced by ``{"type": "image"}`` content entries.
            max_messages_window: maximum number of trailing messages to keep.

        Returns:
            Tuple of (message list ready for ``completion``, empty list kept
            for interface compatibility with the other generators).
        """
        message_window = history.dialogue_history[-max_messages_window:]
        # Never start the window on an assistant turn.
        if len(message_window) > 0 and message_window[0]["role"] == "assistant":
            message_window = message_window[1:]
        # Ensure the system prompt leads the conversation. The emptiness check
        # also covers a window that became empty after the trim above, which
        # previously raised IndexError.
        if not message_window or message_window[0]["role"] != "system":
            message_window = [history.system_prompt_dialogue] + message_window

        if self.use_images:
            # Deep-copy so the base64 payloads never leak back into `history`.
            message_window = copy.deepcopy(message_window)
            img_idx = -1  # consume frames from the end of history.images
            seen_images = 0
            # Walk messages newest-to-oldest so the most recent image entry is
            # paired with the most recent frame.
            for i in range(len(message_window) - 1, -1, -1):
                new_content_list = []
                for content in message_window[i]["content"]:
                    if content["type"] == "text":
                        new_content_list.append(content)
                    elif content["type"] == "image":
                        base64_image = numpy_to_base64(history.images[img_idx])
                        img_idx -= 1
                        # BUG FIX: was `seen_images + 1` (a no-op expression),
                        # which left the counter at 0 and defeated the assert.
                        seen_images += 1
                        new_content = {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            },
                        }
                        new_content_list.append(new_content)
                message_window[i]["content"] = new_content_list
            assert seen_images <= len(history.images), "Too many images"

        return message_window, []

    def generate_unconstrained(
        self,
        batch_messages: list[list[dict]],
        max_tokens=256,
        temperature=0.6,
        **kwargs,
    ) -> tuple[list[str], int]:
        """Generate one completion per message list in the batch.

        Args:
            batch_messages: one chat-message list per requested completion.
            max_tokens: per-request completion token budget.
            temperature: sampling temperature.
            **kwargs: accepted for interface parity; unused.

        Returns:
            Tuple of (generated strings with ``<think>`` blocks stripped,
            total tokens consumed across the whole batch).
        """
        contents = []
        tokens_used = 0
        for messages in batch_messages:
            response = completion(
                model=self.model_name,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                stop=["\n", "\n\n"],
                api_base=self.api_base,
            )
            content = response.choices[0].message.content
            content = self.clear_thinking_tokens(content)
            tokens_used += response.usage.total_tokens
            contents.append(content)

        return contents, tokens_used

    @staticmethod
    def clear_thinking_tokens(content: str) -> str:
        """Drop everything up to a closing ``</think>`` tag and strip whitespace."""
        if "</think>" in content:
            content = content.split("</think>")[-1]
        return content.strip()
0 commit comments