@@ -33,17 +33,7 @@ def __init__(self, config: VLLMLLMConfig):
             api_key=api_key,
             base_url=getattr(self.config, "api_base", "http://localhost:8088/v1")
         )
-        api_key = getattr(self.config, "api_key", "dummy")
-        if not api_key:
-            api_key = "dummy"
-
-        import openai
-        self.client = openai.Client(
-            api_key=api_key,
-            base_url=getattr(self.config, "api_base", "http://localhost:8088/v1")
-        )
 
-    def build_vllm_kv_cache(self, messages: Any) -> str:
     def build_vllm_kv_cache(self, messages: Any) -> str:
         """
         Build a KV cache from chat messages via one vLLM request.
@@ -67,21 +57,12 @@ def build_vllm_kv_cache(self, messages: Any) -> str:
 
         if not prompt.strip():
             raise ValueError("Prompt is empty, cannot build KV cache.")
-            raise ValueError("Prompt is empty, cannot build KV cache.")
 
         # 3. Send request to vLLM server to preload the KV cache
-        if self.client:
-            try:
-                # Use the processed messages for the API call
-        # 3. Send request to vLLM server to preload the KV cache
         if self.client:
             try:
                 # Use the processed messages for the API call
                 prefill_kwargs = {
-                    "model": self.config.model_name_or_path,
-                    "messages": processed_messages,
-                    "max_tokens": 2,
-                    "temperature": 0.0,
                     "model": self.config.model_name_or_path,
                     "messages": processed_messages,
                     "max_tokens": 2,
@@ -90,8 +71,6 @@ def build_vllm_kv_cache(self, messages: Any) -> str:
                 }
                 self.client.chat.completions.create(**prefill_kwargs)
                 logger.info(f"vLLM KV cache prefill completed for prompt: '{prompt[:100]}...'")
-                self.client.chat.completions.create(**prefill_kwargs)
-                logger.info(f"vLLM KV cache prefill completed for prompt: '{prompt[:100]}...'")
             except Exception as e:
                 logger.warning(f"Failed to prefill vLLM KV cache: {e}")
 
@@ -101,7 +80,6 @@ def generate(self, messages: list[MessageDict]) -> str:
10180 """
10281 Generate a response from the model.
10382 """
104- if self .client :
10583 if self .client :
10684 return self ._generate_with_api_client (messages )
10785 else :
@@ -111,11 +89,8 @@ def _generate_with_api_client(self, messages: list[MessageDict]) -> str:
11189 """
11290 Generate response using vLLM API client.
11391 """
114- if self .client :
11592 if self .client :
11693 completion_kwargs = {
117- "model" : self .config .model_name_or_path ,
118- "messages" : messages ,
11994 "model" : self .config .model_name_or_path ,
12095 "messages" : messages ,
12196 "temperature" : float (getattr (self .config , "temperature" , 0.8 )),
@@ -127,9 +102,6 @@ def _generate_with_api_client(self, messages: list[MessageDict]) -> str:
             response_text = response.choices[0].message.content or ""
             logger.info(f"VLLM API response: {response_text}")
             return remove_thinking_tags(response_text) if getattr(self.config, "remove_think_prefix", False) else response_text
-            response_text = response.choices[0].message.content or ""
-            logger.info(f"VLLM API response: {response_text}")
-            return remove_thinking_tags(response_text) if getattr(self.config, "remove_think_prefix", False) else response_text
         else:
             raise RuntimeError("API client is not available")
 
@@ -142,7 +114,6 @@ def _messages_to_prompt(self, messages: list[MessageDict]) -> str:
             role = msg["role"]
             content = msg["content"]
             prompt_parts.append(f"{role.capitalize()}: {content}")
-            prompt_parts.append(f"{role.capitalize()}: {content}")
         return "\n".join(prompt_parts)
 
     def generate_stream(self, messages: list[MessageDict]):
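
For reference, the prefill that the retained build_vllm_kv_cache code performs amounts to one cheap, deterministic chat-completion call against the OpenAI-compatible endpoint, so the vLLM server computes and caches the prompt's KV state for later requests. The sketch below is a standalone illustration of that pattern, not code from this commit; the base URL, API key, and model name are placeholders, and it assumes a vLLM server launched with prefix caching enabled.

# Minimal sketch of the KV-cache prefill pattern (placeholder model name and URL).
import openai

client = openai.Client(api_key="dummy", base_url="http://localhost:8088/v1")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Long shared context that should be cached..."},
]

# max_tokens=2 keeps generation negligible; temperature=0.0 keeps the request deterministic.
client.chat.completions.create(
    model="my-model",  # placeholder; use the model the vLLM server was started with
    messages=messages,
    max_tokens=2,
    temperature=0.0,
)

Subsequent calls that share the same leading messages can then reuse the cached prefix instead of recomputing it.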