From 38265ee3c783a6c157a51654c6c797245ce1b612 Mon Sep 17 00:00:00 2001
From: Tobias Perelstein <5562156+tobocop2@users.noreply.github.com>
Date: Thu, 21 May 2026 23:36:00 -0400
Subject: [PATCH 1/3] chat-format: ignore HuggingFace's {% generation %}
 chat-template tag

HuggingFace's transformers chat-template extension adds {% generation %}
and {% endgeneration %} tags so trainers can mark generation spans for
loss masking. The tags ship in GGUF tokenizer.chat_template metadata
(SmolLM3 et al.), but jinja2's default environment doesn't recognize
them, so Llama() raises TemplateSyntaxError at init for any affected
GGUF, even when the caller passes an explicit chat_format override.

Register a minimal Jinja extension that treats both tags as inert
wrappers: the body between them renders as-is, the markers themselves
emit nothing. No behavioral change for templates that don't use the
tags.

Prior art: PR #2082 attempted the same approach but referenced an
unimported 'nodes' module and didn't consume the body or closing tag.
---
 llama_cpp/llama_chat_format.py  | 13 +++++++++++++
 tests/test_llama_chat_format.py | 23 +++++++++++++++++++++++
 2 files changed, 36 insertions(+)
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 1024fb85b..ac8c02513 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -24,6 +24,8 @@
 )
 
 import jinja2
+from jinja2 import nodes
+from jinja2.ext import Extension
 from jinja2.sandbox import ImmutableSandboxedEnvironment
 
 import numpy as np
@@ -191,6 +193,16 @@ def __call__(
     ) -> ChatFormatterResponse: ...
 
 
+class _GenerationTagIgnore(Extension):
+    """Pass-through for HuggingFace's ``{% generation %}`` chat-template tag."""
+
+    tags = {"generation"}
+
+    def parse(self, parser: jinja2.parser.Parser) -> List[nodes.Node]:
+        parser.stream.skip(1)  # discard the 'generation' tag-name token
+        return parser.parse_statements(("name:endgeneration",), drop_needle=True)
+
+
 class Jinja2ChatFormatter(ChatFormatter):
     def __init__(
         self,
@@ -213,6 +225,7 @@ def __init__(
             loader=jinja2.BaseLoader(),
             trim_blocks=True,
             lstrip_blocks=True,
+            extensions=[_GenerationTagIgnore],
         ).from_string(self.template)
 
     @staticmethod
diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py
index 18c7279cf..16852a472 100644
--- a/tests/test_llama_chat_format.py
+++ b/tests/test_llama_chat_format.py
@@ -92,3 +92,26 @@ def test_hf_tokenizer_config_str_to_chat_formatter():
     )
 
     assert chat_formatter_respoonse.prompt == ("<s>[INST] Hello, world! [/INST]</s>")
+
+
+def test_generation_tag_is_ignored() -> None:
+    """HuggingFace chat templates use {% generation %}/{% endgeneration %} to
+    mark training-time loss spans. At inference the tags must be no-ops or
+    affected GGUFs (SmolLM3 and similar) fail to load with TemplateSyntaxError.
+    """
+    template = (
+        "{% for message in messages %}"
+        "{% generation %}{{ message['role'] }}: {{ message['content'] }}{% endgeneration %}"
+        "{% endfor %}"
+    )
+    chat_formatter = llama_chat_format.Jinja2ChatFormatter(
+        template=template,
+        eos_token="</s>",
+        bos_token="<s>",
+    )
+    response = chat_formatter(
+        messages=[
+            ChatCompletionRequestUserMessage(role="user", content="hi"),
+        ]
+    )
+    assert "user: hi" in response.prompt

From 5455a9fdcd8075eaa91d8c69b8314ed13a2e9f86 Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Sat, 30 May 2026 22:04:15 -0700
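Subject: [PATCH 2/3] fix: simplify generation tag handling

Nest the extension class inside Jinja2ChatFormatter, drop the now-unused
`nodes` import and the return annotation on parse(), remove
test_generation_tag_is_ignored, and add a CHANGELOG entry for the fix.
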
---
 CHANGELOG.md                    |  1 +
 llama_cpp/llama_chat_format.py  | 18 ++++++++----------
 tests/test_llama_chat_format.py | 23 -----------------------
 3 files changed, 9 insertions(+), 33 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index be4ade61d..90d79abbf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat: Update llama.cpp to ggml-org/llama.cpp@d749821db
+- fix: model fails to load when chat template uses HuggingFace generation tags by @tobocop2 in #2226
 - docs: add contributing guide by @abetlen in #2229
 - chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034
 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index ac8c02513..74d48a30f 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -24,7 +24,6 @@
 )
 
 import jinja2
-from jinja2 import nodes
 from jinja2.ext import Extension
 from jinja2.sandbox import ImmutableSandboxedEnvironment
 
@@ -193,17 +192,16 @@ def __call__(
     ) -> ChatFormatterResponse: ...
 
 
-class _GenerationTagIgnore(Extension):
-    """Pass-through for HuggingFace's ``{% generation %}`` chat-template tag."""
-
-    tags = {"generation"}
+class Jinja2ChatFormatter(ChatFormatter):
+    class _GenerationTagIgnore(Extension):
+        """Pass-through for HuggingFace's ``{% generation %}`` chat-template tag."""
 
-    def parse(self, parser: jinja2.parser.Parser) -> List[nodes.Node]:
-        parser.stream.skip(1)  # discard the 'generation' tag-name token
-        return parser.parse_statements(("name:endgeneration",), drop_needle=True)
+        tags = {"generation"}
 
+        def parse(self, parser: jinja2.parser.Parser):
+            parser.stream.skip(1)
+            return parser.parse_statements(("name:endgeneration",), drop_needle=True)
 
-class Jinja2ChatFormatter(ChatFormatter):
     def __init__(
         self,
         template: str,
@@ -225,7 +223,7 @@ def __init__(
             loader=jinja2.BaseLoader(),
             trim_blocks=True,
             lstrip_blocks=True,
-            extensions=[_GenerationTagIgnore],
+            extensions=[Jinja2ChatFormatter._GenerationTagIgnore],
         ).from_string(self.template)
 
     @staticmethod
diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py
index 16852a472..18c7279cf 100644
--- a/tests/test_llama_chat_format.py
+++ b/tests/test_llama_chat_format.py
@@ -92,26 +92,3 @@ def test_hf_tokenizer_config_str_to_chat_formatter():
     )
 
     assert chat_formatter_respoonse.prompt == ("<s>[INST] Hello, world! [/INST]</s>")
-
-
-def test_generation_tag_is_ignored() -> None:
-    """HuggingFace chat templates use {% generation %}/{% endgeneration %} to
-    mark training-time loss spans. At inference the tags must be no-ops or
-    affected GGUFs (SmolLM3 and similar) fail to load with TemplateSyntaxError.
-    """
-    template = (
-        "{% for message in messages %}"
-        "{% generation %}{{ message['role'] }}: {{ message['content'] }}{% endgeneration %}"
-        "{% endfor %}"
-    )
-    chat_formatter = llama_chat_format.Jinja2ChatFormatter(
-        template=template,
-        eos_token="</s>",
-        bos_token="<s>",
-    )
-    response = chat_formatter(
-        messages=[
-            ChatCompletionRequestUserMessage(role="user", content="hi"),
-        ]
-    )
-    assert "user: hi" in response.prompt

From 3521aad221c10042aa24b9627272ecabe4c3cb91 Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Sat, 30 May 2026 22:05:44 -0700
Subject: [PATCH 3/3] refactor: rename generation tag extension

---
 llama_cpp/llama_chat_format.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 74d48a30f..f24b89f3e 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -193,7 +193,7 @@ def __call__(
 
 
 class Jinja2ChatFormatter(ChatFormatter):
-    class _GenerationTagIgnore(Extension):
+    class IgnoreGenerationTags(Extension):
         """Pass-through for HuggingFace's ``{% generation %}`` chat-template tag."""
 
         tags = {"generation"}
@@ -223,7 +223,7 @@ def __init__(
             loader=jinja2.BaseLoader(),
             trim_blocks=True,
             lstrip_blocks=True,
-            extensions=[Jinja2ChatFormatter._GenerationTagIgnore],
+            extensions=[Jinja2ChatFormatter.IgnoreGenerationTags],
         ).from_string(self.template)
 
     @staticmethod