From 38265ee3c783a6c157a51654c6c797245ce1b612 Mon Sep 17 00:00:00 2001 From: Tobias Perelstein <5562156+tobocop2@users.noreply.github.com> Date: Thu, 21 May 2026 23:36:00 -0400 Subject: [PATCH 1/3] chat-format: ignore HuggingFace's {% generation %} chat-template tag HuggingFace's transformers chat-template extension adds {% generation %} and {% endgeneration %} tags so trainers can mark generation spans for loss masking. The tags ship in GGUF tokenizer.chat_template metadata (SmolLM3 et al), but jinja2's default environment doesn't recognize them, so Llama() raises TemplateSyntaxError at init for any affected GGUF, even when the caller passes an explicit chat_format override. Register a minimal Jinja extension that treats both tags as inert wrappers: the body between them renders as-is, the markers themselves emit nothing. No behavioral change for templates that don't use the tags. Prior art: PR #2082 attempted the same approach but referenced an unimported 'nodes' module and didn't consume the body or closing tag. --- llama_cpp/llama_chat_format.py | 13 +++++++++++++ tests/test_llama_chat_format.py | 23 +++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 1024fb85b..ac8c02513 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -24,6 +24,8 @@ ) import jinja2 +from jinja2 import nodes +from jinja2.ext import Extension from jinja2.sandbox import ImmutableSandboxedEnvironment import numpy as np @@ -191,6 +193,16 @@ def __call__( ) -> ChatFormatterResponse: ... +class _GenerationTagIgnore(Extension): + """Pass-through for HuggingFace's ``{% generation %}`` chat-template tag.""" + + tags = {"generation"} + + def parse(self, parser: jinja2.parser.Parser) -> List[nodes.Node]: + parser.stream.skip(1) # discard the 'generation' tag-name token + return parser.parse_statements(("name:endgeneration",), drop_needle=True) + + class Jinja2ChatFormatter(ChatFormatter): def __init__( self, @@ -213,6 +225,7 @@ def __init__( loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True, + extensions=[_GenerationTagIgnore], ).from_string(self.template) @staticmethod diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index 18c7279cf..16852a472 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -92,3 +92,26 @@ def test_hf_tokenizer_config_str_to_chat_formatter(): ) assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! [/INST]") + + +def test_generation_tag_is_ignored() -> None: + """HuggingFace chat templates use {% generation %}/{% endgeneration %} to + mark training-time loss spans. At inference the tags must be no-ops or + affected GGUFs (SmolLM3 and similar) fail to load with TemplateSyntaxError. + """ + template = ( + "{% for message in messages %}" + "{% generation %}{{ message['role'] }}: {{ message['content'] }}{% endgeneration %}" + "{% endfor %}" + ) + chat_formatter = llama_chat_format.Jinja2ChatFormatter( + template=template, + eos_token="", + bos_token="", + ) + response = chat_formatter( + messages=[ + ChatCompletionRequestUserMessage(role="user", content="hi"), + ] + ) + assert "user: hi" in response.prompt From 5455a9fdcd8075eaa91d8c69b8314ed13a2e9f86 Mon Sep 17 00:00:00 2001 From: abetlen Date: Sat, 30 May 2026 22:04:15 -0700 Subject: [PATCH 2/3] fix: simplify generation tag handling --- CHANGELOG.md | 1 + llama_cpp/llama_chat_format.py | 18 ++++++++---------- tests/test_llama_chat_format.py | 23 ----------------------- 3 files changed, 9 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be4ade61d..90d79abbf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - feat: Update llama.cpp to ggml-org/llama.cpp@d749821db +- fix: model fails to load when chat template uses HuggingFace generation tags by @tobocop2 in #2226 - docs: add contributing guide by @abetlen in #2229 - chore: Migrate llama.cpp submodule URL to ggml-org/llama.cpp by @shalinib-ibm in #2034 - fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217 diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ac8c02513..74d48a30f 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -24,7 +24,6 @@ ) import jinja2 -from jinja2 import nodes from jinja2.ext import Extension from jinja2.sandbox import ImmutableSandboxedEnvironment @@ -193,17 +192,16 @@ def __call__( ) -> ChatFormatterResponse: ... -class _GenerationTagIgnore(Extension): - """Pass-through for HuggingFace's ``{% generation %}`` chat-template tag.""" - - tags = {"generation"} +class Jinja2ChatFormatter(ChatFormatter): + class _GenerationTagIgnore(Extension): + """Pass-through for HuggingFace's ``{% generation %}`` chat-template tag.""" - def parse(self, parser: jinja2.parser.Parser) -> List[nodes.Node]: - parser.stream.skip(1) # discard the 'generation' tag-name token - return parser.parse_statements(("name:endgeneration",), drop_needle=True) + tags = {"generation"} + def parse(self, parser: jinja2.parser.Parser): + parser.stream.skip(1) + return parser.parse_statements(("name:endgeneration",), drop_needle=True) -class Jinja2ChatFormatter(ChatFormatter): def __init__( self, template: str, @@ -225,7 +223,7 @@ def __init__( loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True, - extensions=[_GenerationTagIgnore], + extensions=[Jinja2ChatFormatter._GenerationTagIgnore], ).from_string(self.template) @staticmethod diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index 16852a472..18c7279cf 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -92,26 +92,3 @@ def test_hf_tokenizer_config_str_to_chat_formatter(): ) assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! [/INST]") - - -def test_generation_tag_is_ignored() -> None: - """HuggingFace chat templates use {% generation %}/{% endgeneration %} to - mark training-time loss spans. At inference the tags must be no-ops or - affected GGUFs (SmolLM3 and similar) fail to load with TemplateSyntaxError. - """ - template = ( - "{% for message in messages %}" - "{% generation %}{{ message['role'] }}: {{ message['content'] }}{% endgeneration %}" - "{% endfor %}" - ) - chat_formatter = llama_chat_format.Jinja2ChatFormatter( - template=template, - eos_token="", - bos_token="", - ) - response = chat_formatter( - messages=[ - ChatCompletionRequestUserMessage(role="user", content="hi"), - ] - ) - assert "user: hi" in response.prompt From 3521aad221c10042aa24b9627272ecabe4c3cb91 Mon Sep 17 00:00:00 2001 From: abetlen Date: Sat, 30 May 2026 22:05:44 -0700 Subject: [PATCH 3/3] refactor: rename generation tag extension --- llama_cpp/llama_chat_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 74d48a30f..f24b89f3e 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -193,7 +193,7 @@ def __call__( class Jinja2ChatFormatter(ChatFormatter): - class _GenerationTagIgnore(Extension): + class IgnoreGenerationTags(Extension): """Pass-through for HuggingFace's ``{% generation %}`` chat-template tag.""" tags = {"generation"} @@ -223,7 +223,7 @@ def __init__( loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True, - extensions=[Jinja2ChatFormatter._GenerationTagIgnore], + extensions=[Jinja2ChatFormatter.IgnoreGenerationTags], ).from_string(self.template) @staticmethod