@@ -632,7 +632,7 @@ def generate(
632632 temperature : float = 0.7 ,
633633 do_sample : bool = True ,
634634 top_p : float = 0.9 ,
635- skip_chat_template : bool = False ,
635+ use_chat_template : bool = False ,
636636 ** kwargs
637637 ) -> str :
638638 """Generate text from input, respecting the model's context length."""
@@ -648,10 +648,10 @@ def generate(
648648 safe_input_length , max_new_tokens = self ._get_safe_generation_params (max_length )
649649
650650 # Prefer chat-template tokenization when available to ensure special tokens are handled
651- # Skip chat template if skip_chat_template =True (treat chat models as completion models)
651+ # Apply chat template only when use_chat_template =True
652652 inputs = None
653653 prefers_chat_template = False
654- if not skip_chat_template :
654+ if use_chat_template :
655655 if self .is_chat_model is True :
656656 prefers_chat_template = True
657657 # Heuristic fallback if metadata wasn't provided
@@ -757,7 +757,7 @@ def generate(
757757 # Decode only the newly generated tokens
758758 new_tokens = outputs [0 ][input_length :]
759759 # Preserve special tokens in the decoded output when the chat template is NOT applied
759-    # (match try_chat_model_without_template.py behavior); skip them when it is applied
760- skip_special_tokens = not skip_chat_template
760+ skip_special_tokens = use_chat_template
761761 generated_text = self .tokenizer .decode (new_tokens , skip_special_tokens = skip_special_tokens )
762762
763763 return generated_text .strip ()
@@ -1035,9 +1035,9 @@ def __init__(
10351035 except Exception as e :
10361036 raise RuntimeError (f"Failed to initialize vLLM engine: { e } " )
10371037
1038- def _format_prompt (self , user_text : str , skip_chat_template : bool = False ) -> str :
1039- # Prefer chat template for chat models, unless skip_chat_template =True
1040- if skip_chat_template :
1038+ def _format_prompt (self , user_text : str , use_chat_template : bool = False ) -> str :
1039+ # Apply chat template for chat models only when use_chat_template =True
1040+ if not use_chat_template :
10411041 return user_text
10421042 try :
10431043 prefers_chat = False
@@ -1062,12 +1062,12 @@ def generate(
10621062 temperature : float = 0.7 ,
10631063 do_sample : bool = True ,
10641064 top_p : float = 0.9 ,
1065- skip_chat_template : bool = False ,
1065+ use_chat_template : bool = False ,
10661066 ** kwargs
10671067 ) -> str :
10681068 try :
10691069 from vllm import SamplingParams
1070- prompt = self ._format_prompt (input_text , skip_chat_template = skip_chat_template )
1070+ prompt = self ._format_prompt (input_text , use_chat_template = use_chat_template )
10711071 # Map our "max_length" contract to vLLM's max_tokens for new tokens
10721072 # Our safe length logic is in HF wrapper; here we approximate with max_tokens
10731073 params = SamplingParams (
@@ -1091,7 +1091,7 @@ def generate_batch(
10911091 temperature : float = 0.7 ,
10921092 do_sample : bool = True ,
10931093 top_p : float = 0.9 ,
1094- skip_chat_template : bool = False ,
1094+ use_chat_template : bool = False ,
10951095 ** kwargs
10961096 ) -> List [str ]:
10971097 """Generate for a list of prompts in one vLLM call.
@@ -1103,7 +1103,7 @@ def generate_batch(
11031103 return []
11041104 try :
11051105 from vllm import SamplingParams
1106- formatted = [self ._format_prompt (p , skip_chat_template = skip_chat_template ) for p in prompts ]
1106+ formatted = [self ._format_prompt (p , use_chat_template = use_chat_template ) for p in prompts ]
11071107 params = SamplingParams (
11081108 max_tokens = max_length ,
11091109 temperature = temperature ,
@@ -1122,7 +1122,7 @@ def generate_batch(
11221122 self .logger .error (f"vLLM batch generation failed: { e } " )
11231123 # Fall back to sequential to salvage outputs
11241124 return [
1125- self .generate (p , max_length = max_length , temperature = temperature , do_sample = do_sample , top_p = top_p , skip_chat_template = skip_chat_template , ** kwargs )
1125+ self .generate (p , max_length = max_length , temperature = temperature , do_sample = do_sample , top_p = top_p , use_chat_template = use_chat_template , ** kwargs )
11261126 for p in prompts
11271127 ]
11281128
@@ -1383,7 +1383,7 @@ def generate_batch(
13831383 completion_window = str (kwargs .pop ("batch_completion_window" , "24h" ))
13841384
13851385 # Wrapper-only kwargs should not be forwarded to the provider payload.
1386- kwargs .pop ("skip_chat_template" , None )
1386+ kwargs .pop ("use_chat_template" , None )
13871387
13881388 if not prefer_batch_api :
13891389 return super ().generate_batch (
@@ -1979,7 +1979,7 @@ def generate(
19791979 top_p : float = 0.9 ,
19801980 ** kwargs
19811981 ) -> str :
1982- kwargs .pop ("skip_chat_template" , None )
1982+ kwargs .pop ("use_chat_template" , None )
19831983 payload = {
19841984 "contents" : [{"role" : "user" , "parts" : [{"text" : input_text }]}],
19851985 "generationConfig" : self ._build_gemini_generation_config (
@@ -2022,7 +2022,7 @@ def generate_batch(
20222022 timeout = kwargs .pop ("batch_timeout_seconds" , self .batch_timeout_seconds )
20232023
20242024 # Wrapper-only kwarg
2025- kwargs .pop ("skip_chat_template" , None )
2025+ kwargs .pop ("use_chat_template" , None )
20262026
20272027 if not prefer_batch_api :
20282028 return super ().generate_batch (
0 commit comments