enhance apply_chat_template (#2513)

SdeeRK · Jonathans575 · web-flow · commit 23764afeb0d5 · 2025-08-29T11:05:55.000+08:00
Co-authored-by: Jonathans575 <1563710292@qq.com>
diff --git a/paddleformers/transformers/auto/tokenizer.py b/paddleformers/transformers/auto/tokenizer.py
@@ -24,7 +24,6 @@
     resolve_trust_remote_code,
 )
 from transformers.modeling_gguf_pytorch_utils import load_gguf_checkpoint
-from transformers.models import EncoderDecoderConfig
 from transformers.models.auto.configuration_auto import (
     config_class_to_model_type,
     replace_list_option_in_docstrings,
@@ -35,6 +34,9 @@
     get_tokenizer_config,
     tokenizer_class_from_name,
 )
+from transformers.models.encoder_decoder.configuration_encoder_decoder import (
+    EncoderDecoderConfig,
+)
 from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
 from transformers.utils import cached_file
 
diff --git a/paddleformers/transformers/tokenizer_utils.py b/paddleformers/transformers/tokenizer_utils.py
@@ -19,7 +19,7 @@
 import os
 import re
 from functools import wraps
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Union
 
 from transformers import BatchEncoding
 from transformers.tokenization_utils import (
@@ -156,6 +156,61 @@ def wrapper(*args, **kwargs):
 
         setattr(self, method_name, wrapper)
 
+    def apply_chat_template(
+        self,
+        conversation: Union[list[dict[str, str]], list[list[dict[str, str]]], dict[str, Any]],
+        chat_template: Optional[str] = None,
+        **kwargs,
+    ):
+        """Applies chat template to conversation data (supports 3 formats):
+
+        1. Standard chat format:
+        [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi! How can I help?"}
+        ]
+
+        2. Batch Conversation Format:
+        [
+            [{"role": "user", "content": "user messages"}, {"role": "assistant", "content": "assistant messages"}],
+            [{"role": "user", "content": "user messages"}]
+        ]
+
+        3. Enhanced dictionary format (not natively supported by HuggingFace):
+        {
+            "messages": [
+                {"role": "user", "content": "Query"},
+                {"role": "assistant", "content": "Response"}
+            ],
+            "tools": [],    # Function call definitions
+            "documents": [] # RAG context documents
+        }
+        """
+        if isinstance(conversation, dict):
+            messages = conversation.get("messages", None)
+            tools = conversation.get("tools", None)
+            documents = conversation.get("documents", None)
+
+            # Allow kwargs override for empty values
+            if not tools and "tools" in kwargs:
+                tools = kwargs.pop("tools")
+            if not documents and "documents" in kwargs:
+                documents = kwargs.pop("documents")
+
+            return super().apply_chat_template(
+                conversation=messages,
+                chat_template=chat_template,
+                tools=tools,
+                documents=documents,
+                **kwargs,
+            )
+        else:
+            return super().apply_chat_template(
+                conversation=conversation,
+                chat_template=chat_template,
+                **kwargs,
+            )
+
     # Rewrite hf's tokenizer function from_pretrained
     @classmethod
     def from_pretrained(