Commit e7d7090

Automatically infer whether to use a chat template or not instead of using kwargs (#885)
* chat_template automatically inferred
* single system
* fix
* added unit tests + integ test
* mock the model creation phase to go faster
1 parent: bc5e450 · commit: e7d7090

20 files changed: +106 −42 lines
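The heart of the change, per the commit title: rather than every backend config and CLI invocation carrying a `use_chat_template` flag, lighteval now infers the setting from the model itself. The diff below only shows the flag's removal from docs and example configs, so as a reading aid here is a minimal sketch of what such inference can look like, assuming it keys off the tokenizer's `chat_template` attribute; the function name is hypothetical, not lighteval API:

```python
# Minimal sketch (assumed logic, not lighteval's actual code): a model
# should be prompted through a chat template exactly when its tokenizer
# ships one. `should_use_chat_template` is a hypothetical name.
from transformers import AutoTokenizer


def should_use_chat_template(model_name: str) -> bool:
    """Return True if the model's tokenizer defines a chat template."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return getattr(tokenizer, "chat_template", None) is not None
```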

community_tasks/oz_evals.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -30,8 +30,6 @@
 Data consists of 1k+ high-quality questions and answers which were used as part of entry exams at the Faculty of Philosophy and Faculty of Organizational Sciences, University of Belgrade.
 The exams test the General Knowledge of students and were used in the enrollment periods from 2003 to 2024.
 For more details and results see: https://huggingface.co/datasets/DjMel/oz-eval
-
-In order to have comparable results to ours, please do not forget to run with --use_chat_template
 """
 
 from lighteval.metrics.metrics import Metrics
```

community_tasks/serbian_eval.py

Lines changed: 1 addition & 3 deletions

```diff
@@ -137,9 +137,7 @@ def prompt_fn_oz_eval_task(line, task_name: str = None):
         or if 'answer_str' is not one of ["A", "B", "C", "D", "E"].
 
     Note:
-        This function is part of the LightEval setup, specifically for loading OZ Eval dataset questions
-        into the evaluation environment. For consistent evaluation results, run the task with
-        `--use_chat_template`. The OZ Eval dataset is available at https://huggingface.co/datasets/DjMel/oz-eval.
+        The OZ Eval dataset is available at https://huggingface.co/datasets/DjMel/oz-eval.
 
     """
     query_template = """Pitanje: {question}\n
```

docs/source/use-litellm-as-backend.mdx

Lines changed: 0 additions & 4 deletions

````diff
@@ -12,12 +12,8 @@ Documentation for available APIs and compatible endpoints can be found [here](ht
 lighteval endpoint litellm \
     "provider=openai,model_name=gpt-3.5-turbo" \
     "lighteval|gsm8k|0|0" \
-    --use-chat-template
 ```
 
-> [!WARNING]
-> `--use-chat-template` is required for litellm to work properly.
-
 ## Using a config file
 
 Litellm allows generation with any OpenAI compatible endpoint, for example you
````

docs/source/use-sglang-as-backend.mdx

Lines changed: 0 additions & 1 deletion

```diff
@@ -52,7 +52,6 @@ model_parameters:
   context_length: null
   random_seed: 1
   trust_remote_code: False
-  use_chat_template: False
   device: "cuda"
   skip_tokenizer_init: False
   kv_cache_dtype: "auto"
```

docs/source/use-vllm-as-backend.mdx

Lines changed: 0 additions & 2 deletions

```diff
@@ -57,7 +57,6 @@ model_parameters:
   swap_space: 4
   seed: 1
   trust_remote_code: True
-  use_chat_template: True
   add_special_tokens: True
   multichoice_continuations_start_space: True
   pairwise_tokenization: True
@@ -99,7 +98,6 @@ model_parameters:
   swap_space: 4
   seed: 1
   trust_remote_code: True
-  use_chat_template: True
   add_special_tokens: True
   multichoice_continuations_start_space: True
   pairwise_tokenization: True
```

docs/source/using-the-python-api.mdx

Lines changed: 0 additions & 1 deletion

```diff
@@ -39,7 +39,6 @@ def main():
     model_config = VLLMModelConfig(
         model_name="HuggingFaceH4/zephyr-7b-beta",
         dtype="float16",
-        use_chat_template=True,
     )
 
     task = "helm|mmlu|5|1"
```
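After this change, the docs snippet reduces to the config below; a sketch of the resulting call, with the import path assumed from the surrounding docs page (the diff shows only the call site):

```python
# The documented VLLM config after this commit: the chat-template
# decision is inferred, so no use_chat_template kwarg is passed.
# Import path is an assumption; it does not appear in this diff.
from lighteval.models.vllm.vllm_model import VLLMModelConfig

model_config = VLLMModelConfig(
    model_name="HuggingFaceH4/zephyr-7b-beta",
    dtype="float16",
)
```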

examples/model_configs/peft_model.yaml

Lines changed: 0 additions & 2 deletions

```diff
@@ -7,10 +7,8 @@ model_parameters:
   revision: "main" # revision to use
   trust_remote_code: true # Trust remote code
   model_parallel: null # Model parallel
-  use_chat_template: true # Use chat template
   max_length: 2048 # maximum length of the input text and the generated text
 
   # should go in generation
   max_generation_toks: 256 # maximum number of tokens to generate
-  #use_chat_template: true # Use chat template
   batch_size: 10 # batch size to use
```

examples/model_configs/sglang_model_config.yaml

Lines changed: 0 additions & 1 deletion

```diff
@@ -6,7 +6,6 @@ model_parameters:
   context_length: null
   random_seed: 1
   trust_remote_code: False
-  use_chat_template: True
   device: "cuda"
   skip_tokenizer_init: False
   kv_cache_dtype: "auto"
```

examples/model_configs/transformers_model.yaml

Lines changed: 0 additions & 1 deletion

```diff
@@ -6,7 +6,6 @@ model_parameters:
   model_parallel: false
   batch_size: 1
   multichoice_continuations_start_space: null # If true/false, will force multiple choice continuations to start/not start with a space. If none, will do nothing
-  use_chat_template: true
   generation_parameters:
     temperature: 0.0
     top_p: 0.9
```

examples/model_configs/transformers_vlm_model.yaml

Lines changed: 0 additions & 1 deletion

```diff
@@ -6,7 +6,6 @@ model_parameters:
   model_parallel: false
   batch_size: 1
   use_fast_image_processor: true
-  use_chat_template: true
   generation_parameters:
     temperature: 0.0
     top_p: 0.9
```
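With the key gone from every example config above, prompt construction presumably branches on the inferred value at runtime. An illustrative sketch of such a branch using the standard `transformers` tokenizer API; `build_prompt` is a hypothetical helper, not lighteval's implementation:

```python
# Illustrative only: branch prompt building on an inferred chat-template
# setting. apply_chat_template is the standard transformers tokenizer
# method; build_prompt itself is a hypothetical helper.
def build_prompt(tokenizer, question: str) -> str:
    if getattr(tokenizer, "chat_template", None):
        messages = [{"role": "user", "content": question}]
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    # No chat template: fall back to a raw completion-style prompt.
    return question
```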
