@@ -4107,7 +4107,6 @@ def chatml_function_calling(
         chat_completion["choices"][0]["message"]["function_call"] = single_function_call
     return chat_completion
 
-
 @register_chat_completion_handler("gguf-function-calling")
 def gguf_function_calling(
     llama: llama.Llama,
@@ -4142,18 +4141,9 @@ def gguf_function_calling(
     llama_types.CreateChatCompletionResponse,
     Iterator[llama_types.CreateChatCompletionStreamResponse],
 ]:
-
-    function_calling_template = None
-    if hasattr(llama, 'model_path'):
-        metadata = llama.metadata
-        if metadata and "tokenizer.chat_template" in metadata:
-            function_calling_template = metadata["tokenizer.chat_template"]
-
-
     function_calling_template = (
         "{% for message in messages %}"
         "<|im_start|>{{ message.role }}\n"
-        # System message
         "{% if message.role == 'system' %}"
         "{{ message.content }}"
         "{% if tool_calls %}"
@@ -4178,22 +4168,16 @@ def gguf_function_calling(
41784168 "{% endif %}"
41794169 "<|im_end|>\n "
41804170 "{% endif %}"
4181- # User message
41824171 "{% if message.role == 'user' %}"
41834172 "{{ message.content }}"
41844173 "<|im_end|>\n "
41854174 "{% endif %}"
4186- # Assistant message
41874175 "{% if message.role == 'assistant' %}"
4188- ## Regular message
41894176 "{% if message.content and message.content | length > 0 %}"
4190- "{% if tool_calls %}"
4191- "message:\n "
4192- "{% endif %}"
4177+ "{% if tool_calls %}message:\n {% endif %}"
41934178 "{{ message.content }}"
41944179 "<|im_end|>\n "
41954180 "{% endif %}"
4196- ## Function calls
41974181 "{% if 'tool_calls' in message %}"
41984182 "{% for tool_call in message.tool_calls %}"
41994183 "functions.{{ tool_call.function.name }}:\n "
@@ -4210,27 +4194,23 @@ def gguf_function_calling(
         undefined=jinja2.StrictUndefined,
     ).from_string(function_calling_template)
 
-    # Convert legacy functions to tools
     if functions is not None:
         tools = [{"type": "function", "function": function} for function in functions]
 
-    # Convert legacy function_call to tool_choice
     if function_call is not None:
         if isinstance(function_call, str) and (function_call in ("none", "auto")):
             tool_choice = function_call
         if isinstance(function_call, dict) and "name" in function_call:
             tool_choice = {"type": "function", "function": {"name": function_call["name"]}}
 
-    # Collect the llama.create_completion keyword arguments so we don't have to repeat these with
-    # each completion call
     stop = (
         [stop, "<|im_end|>"]
         if isinstance(stop, str)
         else [*stop, "<|im_end|>"]
         if stop
         else ["<|im_end|>"]
     )
-    grammar = (  # It is assumed the grammar applies to messages only, not tool calls
+    grammar = (
         grammar
         if grammar is not None
         else (
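
The chained conditional that builds stop above is compact; a standalone restatement (the helper name and sample inputs are hypothetical, not part of this change) shows the three input shapes it normalizes:

    # Restates the stop-normalization expression from the hunk above so it can
    # be exercised in isolation.
    def normalize_stop(stop):
        return (
            [stop, "<|im_end|>"]
            if isinstance(stop, str)
            else [*stop, "<|im_end|>"]
            if stop
            else ["<|im_end|>"]
        )

    assert normalize_stop("###") == ["###", "<|im_end|>"]            # str: wrapped in a list
    assert normalize_stop(["###", "Q:"]) == ["###", "Q:", "<|im_end|>"]  # list: appended
    assert normalize_stop(None) == ["<|im_end|>"]                    # None/empty: default only
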
@@ -4260,7 +4240,6 @@ def gguf_function_calling(
42604240 "grammar" : grammar ,
42614241 }
42624242
4263- # Case 1: No tool use
42644243 if (
42654244 tool_choice is None
42664245 or (isinstance (tool_choice , str ) and tool_choice == "none" )
@@ -4273,18 +4252,15 @@ def gguf_function_calling(
         return _convert_completion_to_chat(
             llama.create_completion(
                 prompt=prompt,
-                **completion_kwargs,  # type: ignore[arg-type]
+                **completion_kwargs,
                 logprobs=top_logprobs if logprobs else None,
             ),
             stream=stream,
         )
 
-    # Ensure there is a system prompt to attach the tool metadata to
     if not any(message["role"] == "system" for message in messages):
         messages = [*messages, {"role": "system", "content": ""}]
 
-    # Case 2: Automatic or fixed tool choice
-    # Case 2 step 1: Determine whether to respond with a message or a tool call
     assert (isinstance(tool_choice, str) and tool_choice == "auto") or isinstance(tool_choice, dict)
     if isinstance(tool_choice, dict):
         tools = [t for t in tools if t["function"]["name"] == tool_choice["function"]["name"]]
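
When tool_choice pins a specific function, the filter above narrows tools to that single entry before any prompting happens. A small illustration with hypothetical schemas:

    tools = [
        {"type": "function", "function": {"name": "get_weather", "parameters": {}}},
        {"type": "function", "function": {"name": "get_time", "parameters": {}}},
    ]
    tool_choice = {"type": "function", "function": {"name": "get_time"}}
    tools = [t for t in tools if t["function"]["name"] == tool_choice["function"]["name"]]
    assert [t["function"]["name"] for t in tools] == ["get_time"]
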
@@ -4309,7 +4285,7 @@ def gguf_function_calling(
         llama_types.CreateCompletionResponse,
         llama.create_completion(
             prompt=prompt,
-            **{  # type: ignore[arg-type]
+            **{
                 **completion_kwargs,
                 "temperature": 0,
                 "stream": False,
@@ -4322,40 +4298,33 @@ def gguf_function_calling(
         ),
     )
     text = completion["choices"][0]["text"]
-
-    # Parse the response to extract message and/or function calls
+
     message_content = None
     tool_name = None
-
+
     if text.startswith("message:"):
-        # Extract message content
         if "<function_calls>" in text:
-            # Combined message and function calls
             parts = text.split("<function_calls>", 1)
             message_content = parts[0][len("message:"):].strip()
             if len(parts) > 1 and "functions." in parts[1]:
                 tool_name = parts[1].split("functions.", 1)[1].split(":", 1)[0].strip()
         else:
-            # Message only
             message_content = text[len("message:"):].strip()
     elif text.startswith("<function_calls>") and "functions." in text:
-        # Function calls only
         tool_name = text.split("functions.", 1)[1].split(":", 1)[0].strip()
 
-    # Case 2 step 2A: Respond with message only
     if tool_name is None and message_content is not None:
         prompt = template_renderer.render(
             messages=messages, tools=[], tool_calls=None, add_generation_prompt=True
         )
         completion_response = llama.create_completion(
             prompt=prompt,
-            **completion_kwargs,  # type: ignore[arg-type]
+            **completion_kwargs,
             logprobs=top_logprobs if logprobs else None,
         )
         completion_response["choices"][0]["text"] = message_content
         return _convert_completion_to_chat(completion_response, stream=stream)
 
-    # Case 2 step 2B: One or more function calls
     follow_up_gbnf_tool_grammar = (
         'root ::= functions | "</function_calls>" | "<|im_end|>"\n'
         f"functions ::= {function_names}\n"
@@ -4369,7 +4338,6 @@ def gguf_function_calling(
     completions: List[llama_types.CreateCompletionResponse] = []
     completions_tool_name: List[str] = []
     while tool is not None and len(completions) <= 16:
-        # Generate the parameter values for the selected tool
         prompt += f"functions.{tool_name}:\n"
         try:
             grammar = llama_grammar.LlamaGrammar.from_json_schema(
@@ -4386,7 +4354,7 @@ def gguf_function_calling(
             )
         completion_or_chunks = llama.create_completion(
             prompt=prompt,
-            **{  # type: ignore[arg-type]
+            **{
                 **completion_kwargs,
                 "max_tokens": None,
                 "grammar": grammar,
@@ -4397,41 +4365,40 @@ def gguf_function_calling(
         completions_tool_name.append(tool_name)
         prompt += completion["choices"][0]["text"]
         prompt += "\n"
-        # Determine whether to call another tool or stop
         response = cast(
             llama_types.CreateCompletionResponse,
             llama.create_completion(
                 prompt=prompt,
-                **{  # type: ignore[arg-type]
+                **{
                     **completion_kwargs,
                     "temperature": 0,
                     "stream": False,
-                    "stop": [*completion_kwargs["stop"], ":", "</function_calls>"],  # type: ignore[misc]
+                    "stop": [*completion_kwargs["stop"], ":", "</function_calls>"],
                     "max_tokens": None,
                     "grammar": llama_grammar.LlamaGrammar.from_string(
                         follow_up_gbnf_tool_grammar, verbose=llama.verbose
                     ),
                 },
             ),
         )
-        tool_name = response["choices"][0]["text"][len("functions.") :]
-        tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None)
-    # Merge the completions into a single chat completion
+        tool_name = response["choices"][0]["text"][len("functions.") :] if response["choices"][0]["text"].startswith("functions.") else None
+        tool = next((tool for tool in tools if tool_name and tool["function"]["name"] == tool_name), None)
+
     chat_completion: llama_types.CreateChatCompletionResponse = {
         "id": "chat" + completion["id"],
         "object": "chat.completion",
         "created": completion["created"],
         "model": completion["model"],
         "choices": [
             {
-                "finish_reason": "tool_calls",
+                "finish_reason": "tool_calls" if completions else "stop",
                 "index": 0,
                 "logprobs": _convert_text_completion_logprobs_to_chat(
                     completion["choices"][0]["logprobs"]
                 ),
                 "message": {
                     "role": "assistant",
-                    "content": message_content,  # Include message content if present
+                    "content": message_content,
                     "tool_calls": [
                         {
                             "id": "call_" + f"_{i}_" + tool_name + "_" + completion["id"],
@@ -4444,7 +4411,9 @@ def gguf_function_calling(
                         for i, (tool_name, completion) in enumerate(
                             zip(completions_tool_name, completions)
                         )
-                    ],
+                    ]
+                    if completions
+                    else None,
                 },
             }
         ],
@@ -4465,8 +4434,8 @@ def gguf_function_calling(
     }
     if len(completions) == 1:
         single_function_call: llama_types.ChatCompletionResponseFunctionCall = {
-            "name": tool_name,
+            "name": completions_tool_name[0],
             "arguments": completions[0]["choices"][0]["text"],
         }
         chat_completion["choices"][0]["message"]["function_call"] = single_function_call
-    return chat_completion
+    return chat_completion
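
For reference, a minimal usage sketch of the new handler: the register_chat_completion_handler decorator makes it selectable by name through llama-cpp-python's chat_format argument, in the same way as the existing chatml-function-calling handler. The model path and tool schema below are placeholders:

    from llama_cpp import Llama

    llm = Llama(model_path="model.gguf", chat_format="gguf-function-calling")
    response = llm.create_chat_completion(
        messages=[{"role": "user", "content": "What is the weather in Paris?"}],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get the current weather for a city",
                    "parameters": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                },
            }
        ],
        tool_choice="auto",
    )
    print(response["choices"][0]["message"])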