Improve gguf_function_calling response parsing; simplify chatml_function_calling tool-name extraction

okaris · okaris · commit 640e59f3ac3b · 2025-09-17T16:49:21.000Z
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
@@ -3975,23 +3975,9 @@ def chatml_function_calling(
         ),
     )
     text = completion["choices"][0]["text"]
-    # Extract message content and/or function call
-    tool_name = None
-    message_content = None
-    
-    if text.startswith("message:"):
-        # Handle message with or without function call
-        parts = text.split("<function_calls>", 1)
-        message_content = parts[0][len("message:"):].strip()
-        if len(parts) > 1:
-            # Has both message and function call
-            function_text = parts[1].strip()
-            tool_name = function_text.split("\n")[0][len("functions."):].rstrip(":")
-    else:
-        # Only function call
-        tool_name = text.split("\n")[0][len("<function_calls>\nfunctions."):].rstrip(":")
+    tool_name = None if text.startswith("message") else text.split("\n")[-1][len("functions.") :]
 
-    # Case 2 step 2A: Message only
+    # Case 2 step 2A: Respond with a message
     if tool_name is None:
         prompt = template_renderer.render(
             messages=messages, tools=[], tool_calls=None, add_generation_prompt=True
@@ -4081,7 +4067,7 @@ def chatml_function_calling(
                 ),
                 "message": {
                     "role": "assistant",
-                    "content": message_content,  # Include message content when present
+                    "content": None,
                     "tool_calls": [
                         {
                             "id": "call_" + f"_{i}_" + tool_name + "_" + completion["id"],
@@ -4157,15 +4143,14 @@ def gguf_function_calling(
     Iterator[llama_types.CreateChatCompletionStreamResponse],
 ]:
     
-    # Try to get model-specific template from metadata
-    model_template = None
+    function_calling_template = None
     if hasattr(llama, 'model_path'):
         metadata = llama.metadata
         if metadata and "tokenizer.chat_template" in metadata:
-            model_template = metadata["tokenizer.chat_template"]
+            function_calling_template = metadata["tokenizer.chat_template"]
+
 
-    # Use model template or fall back to default
-    function_calling_template = model_template if model_template else (
+    function_calling_template = (
         "{% for message in messages %}"
         "<|im_start|>{{ message.role }}\n"
         # System message
@@ -4189,6 +4174,7 @@ def gguf_function_calling(
         "\nfunctions.<function_name>:"
         '\n{ "arg1": "value1", "arg2": "value2" }'
         "\n</function_calls>"
+        "\n\nYou can also combine both formats to provide explanatory text with function calls."
         "{% endif %}"
         "<|im_end|>\n"
         "{% endif %}"
@@ -4309,12 +4295,15 @@ def gguf_function_calling(
     )
     initial_gbnf_tool_grammar = (
         (
-            'root ::= "<function_calls>" "\\n" functions | "message:" text | "message:" text "<function_calls>" "\\n" functions\n'
+            'root ::= message_only | message_with_functions | functions_only\n'
+            'message_only ::= "message:" text\n'
+            'message_with_functions ::= "message:" text "<function_calls>\\n" functions\n'
+            'functions_only ::= "<function_calls>\\n" functions\n'
             'text ::= [^<]+\n'
             f"functions ::= {function_names}\n"
         )
         if tool_choice == "auto"
-        else f'root ::= "<function_calls>" "\\n" functions\nfunctions ::= {function_names}\n'
+        else f'root ::= "<function_calls>\\n" functions\nfunctions ::= {function_names}\n'
     )
     completion = cast(
         llama_types.CreateCompletionResponse,
@@ -4333,37 +4322,49 @@ def gguf_function_calling(
         ),
     )
     text = completion["choices"][0]["text"]
-    # Extract message content and/or function call
-    tool_name = None
+    
+    # Parse the response to extract message and/or function calls
     message_content = None
-
+    tool_name = None
+    
     if text.startswith("message:"):
-        # Handle message with or without function call
-        parts = text.split("<function_calls>", 1)
-        message_content = parts[0][len("message:"):].strip()
-        if len(parts) > 1:
-            # Has both message and function call
-            tool_name = parts[1].split("\n")[-1][len("functions.") :]
-    else:
-        # Only function call
-        tool_name = text.split("\n")[-1][len("functions.") :]
+        # Extract message content
+        if "<function_calls>" in text:
+            # Combined message and function calls
+            parts = text.split("<function_calls>", 1)
+            message_content = parts[0][len("message:"):].strip()
+            if len(parts) > 1 and "functions." in parts[1]:
+                tool_name = parts[1].split("functions.", 1)[1].split(":", 1)[0].strip()
+        else:
+            # Message only
+            message_content = text[len("message:"):].strip()
+    elif text.startswith("<function_calls>") and "functions." in text:
+        # Function calls only
+        tool_name = text.split("functions.", 1)[1].split(":", 1)[0].strip()
 
-   # Case 2 step 2A: Respond with message only
+    # Case 2 step 2A: Respond with message only
     if tool_name is None and message_content is not None:
-        completion["choices"][0]["text"] = message_content
-        return _convert_completion_to_chat(completion, stream=stream)
+        prompt = template_renderer.render(
+            messages=messages, tools=[], tool_calls=None, add_generation_prompt=True
+        )
+        completion_response = llama.create_completion(
+            prompt=prompt,
+            **completion_kwargs,  # type: ignore[arg-type]
+            logprobs=top_logprobs if logprobs else None,
+        )
+        completion_response["choices"][0]["text"] = message_content
+        return _convert_completion_to_chat(completion_response, stream=stream)
 
-        # Case 2 step 2B: One or more function calls
-        follow_up_gbnf_tool_grammar = (
-            'root ::= functions | "</function_calls>" | "<|im_end|>"\n'
-            f"functions ::= {function_names}\n"
+    # Case 2 step 2B: One or more function calls
+    follow_up_gbnf_tool_grammar = (
+        'root ::= functions | "</function_calls>" | "<|im_end|>"\n'
+        f"functions ::= {function_names}\n"
+    )
+    prompt += "<function_calls>\n"
+    if stream:
+        return _stream_tool_calls(
+            llama, prompt, tools, tool_name, completion_kwargs, follow_up_gbnf_tool_grammar
         )
-        prompt += "<function_calls>\n"
-        if stream:
-            return _stream_tool_calls(
-                llama, prompt, tools, tool_name, completion_kwargs, follow_up_gbnf_tool_grammar
-            )
-            
     tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None)
     completions: List[llama_types.CreateCompletionResponse] = []
     completions_tool_name: List[str] = []
@@ -4430,7 +4431,7 @@ def gguf_function_calling(
                 ),
                 "message": {
                     "role": "assistant",
-                    "content": message_content,  # Include message content if present
+                    "content": None,
                     "tool_calls": [
                         {
                             "id": "call_" + f"_{i}_" + tool_name + "_" + completion["id"],