Improve gguf-function-calling parser: prefer the model's chat template and allow message text alongside function calls

okaris · okaris · commit c3112c76ff87 · 2025-09-17T15:37:57.000Z
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
@@ -4143,14 +4143,15 @@ def gguf_function_calling(
     Iterator[llama_types.CreateChatCompletionStreamResponse],
 ]:
     
-    function_calling_template = None
+    # Try to get model-specific template from metadata
+    model_template = None
     if hasattr(llama, 'model_path'):
         metadata = llama.metadata
         if metadata and "tokenizer.chat_template" in metadata:
-            function_calling_template = metadata["tokenizer.chat_template"]
+            model_template = metadata["tokenizer.chat_template"]
 
-
-    function_calling_template = (
+    # Use model template or fall back to default
+    function_calling_template = model_template if model_template else (
         "{% for message in messages %}"
         "<|im_start|>{{ message.role }}\n"
         # System message
@@ -4294,7 +4295,11 @@ def gguf_function_calling(
     )
     initial_gbnf_tool_grammar = (
         (
-            'root ::= "<function_calls>" "\\n" functions | "message:"\n'
+            'root ::= message_only | message_with_functions | functions_only\n'
+            'message_only ::= "message:" text\n'
+            'message_with_functions ::= "message:" text "<function_calls>\\n" functions\n'
+            'functions_only ::= "<function_calls>\\n" functions\n'
+            'text ::= [^<]+\n'
             f"functions ::= {function_names}\n"
         )
         if tool_choice == "auto"
@@ -4317,32 +4322,37 @@ def gguf_function_calling(
         ),
     )
     text = completion["choices"][0]["text"]
-    tool_name = None if text.startswith("message") else text.split("\n")[-1][len("functions.") :]
+    # Extract message content and/or function call
+    tool_name = None
+    message_content = None
+
+    if text.startswith("message:"):
+        # Handle message with or without function call
+        parts = text.split("<function_calls>", 1)
+        message_content = parts[0][len("message:"):].strip()
+        if len(parts) > 1:
+            # Has both message and function call
+            tool_name = parts[1].split("\n")[-1][len("functions.") :]
+    else:
+        # Only function call
+        tool_name = text.split("\n")[-1][len("functions.") :]
 
-    # Case 2 step 2A: Respond with a message
-    if tool_name is None:
-        prompt = template_renderer.render(
-            messages=messages, tools=[], tool_calls=None, add_generation_prompt=True
-        )
-        return _convert_completion_to_chat(
-            llama.create_completion(
-                prompt=prompt,
-                **completion_kwargs,  # type: ignore[arg-type]
-                logprobs=top_logprobs if logprobs else None,
-            ),
-            stream=stream,
-        )
+    # Case 2 step 2A: Respond with message only
+    if tool_name is None and message_content is not None:
+        completion["choices"][0]["text"] = message_content
+        return _convert_completion_to_chat(completion, stream=stream)
 
-    # Case 2 step 2B: One or more function calls
-    follow_up_gbnf_tool_grammar = (
-        'root ::= functions | "</function_calls>" | "<|im_end|>"\n'
-        f"functions ::= {function_names}\n"
-    )
-    prompt += "<function_calls>\n"
-    if stream:
-        return _stream_tool_calls(
-            llama, prompt, tools, tool_name, completion_kwargs, follow_up_gbnf_tool_grammar
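+        # Case 2 step 2B: One or more function calls (FIXME: unreachable as indented -- this block follows the `return` inside the `if` above; dedent it one level so it runs when a tool was selected)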
+        follow_up_gbnf_tool_grammar = (
+            'root ::= functions | "</function_calls>" | "<|im_end|>"\n'
+            f"functions ::= {function_names}\n"
         )
+        prompt += "<function_calls>\n"
+        if stream:
+            return _stream_tool_calls(
+                llama, prompt, tools, tool_name, completion_kwargs, follow_up_gbnf_tool_grammar
+            )
+            
     tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None)
     completions: List[llama_types.CreateCompletionResponse] = []
     completions_tool_name: List[str] = []
@@ -4409,7 +4419,7 @@ def gguf_function_calling(
                 ),
                 "message": {
                     "role": "assistant",
-                    "content": None,
+                    "content": message_content,  # Include message content if present
                     "tool_calls": [
                         {
                             "id": "call_" + f"_{i}_" + tool_name + "_" + completion["id"],