Commit 1c56a0d

[Fix] Watsonx - Apply correct prompt templates for openai/gpt-oss model family (#15341)
* fix: apply_prompt_template
* Revert "fix: apply_prompt_template"
  This reverts commit 3e0e40b.
* add apply_prompt_template for WatsonX
* feat: add apply_prompt_template
* test_watsonx_gpt_oss_prompt_transformation
* Revert "add apply_prompt_template for WatsonX"
  This reverts commit 3e80903.
* add apply_prompt_template for WatsonX
* fix apply_prompt_template
* fix: add hf template handler
* fix hf_chat_template
* fix _get_tokenizer_config
* fix hf_chat_template
* add WatsonXModelPattern
* fix aapply_prompt_template
1 parent 9d84a7c commit 1c56a0d
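
For context, a minimal sketch of how the new watsonx routing in prompt_factory is exercised (assuming a litellm build that includes this commit; the openai/gpt-oss model id below is illustrative only, not taken from the diff):

from litellm.litellm_core_utils.prompt_templates.factory import prompt_factory

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# With custom_llm_provider="watsonx", prompt_factory now delegates to
# IBMWatsonXChatConfig.apply_prompt_template(model=..., messages=...)
# instead of the old hard-coded granite / mistral / llama-3 branches.
prompt = prompt_factory(
    model="openai/gpt-oss-120b",  # illustrative gpt-oss model id
    messages=messages,
    custom_llm_provider="watsonx",
)
print(prompt)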

File tree

8 files changed: +692 −128 lines

litellm/litellm_core_utils/prompt_templates/factory.py

Lines changed: 201 additions & 86 deletions
@@ -2,7 +2,6 @@
 import json
 import mimetypes
 import re
-from litellm._uuid import uuid
 import xml.etree.ElementTree as ET
 from enum import Enum
 from typing import Any, List, Optional, Tuple, cast, overload
@@ -13,6 +12,7 @@
 import litellm.types
 import litellm.types.llms
 from litellm import verbose_logger
+from litellm._uuid import uuid
 from litellm.llms.custom_httpx.http_handler import HTTPHandler, get_async_httpx_client
 from litellm.types.files import get_file_extension_from_mime_type
 from litellm.types.llms.anthropic import *
@@ -364,62 +364,20 @@ def phind_codellama_pt(messages):
     return prompt


-def hf_chat_template(  # noqa: PLR0915
-    model: str, messages: list, chat_template: Optional[Any] = None
-):
-    # Define Jinja2 environment
-    env = ImmutableSandboxedEnvironment()
-
-    def raise_exception(message):
-        raise Exception(f"Error message - {message}")
-
-    # Create a template object from the template text
-    env.globals["raise_exception"] = raise_exception
-
-    ## get the tokenizer config from huggingface
-    bos_token = ""
-    eos_token = ""
-    if chat_template is None:
-
-        def _get_tokenizer_config(hf_model_name):
-            try:
-                url = f"https://huggingface.co/{hf_model_name}/raw/main/tokenizer_config.json"
-                # Make a GET request to fetch the JSON data
-                client = HTTPHandler(concurrent_limit=1)
-
-                response = client.get(url)
-            except Exception as e:
-                raise e
-            if response.status_code == 200:
-                # Parse the JSON data
-                tokenizer_config = json.loads(response.content)
-                return {"status": "success", "tokenizer": tokenizer_config}
-            else:
-                return {"status": "failure"}
-
-        if model in litellm.known_tokenizer_config:
-            tokenizer_config = litellm.known_tokenizer_config[model]
-        else:
-            tokenizer_config = _get_tokenizer_config(model)
-            litellm.known_tokenizer_config.update({model: tokenizer_config})
-
-        if (
-            tokenizer_config["status"] == "failure"
-            or "chat_template" not in tokenizer_config["tokenizer"]
-        ):
-            raise Exception("No chat template found")
-        ## read the bos token, eos token and chat template from the json
-        tokenizer_config = tokenizer_config["tokenizer"]  # type: ignore
-
-        bos_token = tokenizer_config["bos_token"]  # type: ignore
-        if bos_token is not None and not isinstance(bos_token, str):
-            if isinstance(bos_token, dict):
-                bos_token = bos_token.get("content", None)
-        eos_token = tokenizer_config["eos_token"]  # type: ignore
-        if eos_token is not None and not isinstance(eos_token, str):
-            if isinstance(eos_token, dict):
-                eos_token = eos_token.get("content", None)
-        chat_template = tokenizer_config["chat_template"]  # type: ignore
+def _render_chat_template(env, chat_template: str, bos_token: str, eos_token: str, messages: list) -> str:
+    """
+    Shared template rendering logic for both sync and async hf_chat_template
+
+    Args:
+        env: Jinja2 environment
+        chat_template: Chat template string
+        bos_token: Beginning of sequence token
+        eos_token: End of sequence token
+        messages: Messages to render
+
+    Returns:
+        Rendered template string
+    """
     try:
         template = env.from_string(chat_template)  # type: ignore
     except Exception as e:
@@ -434,7 +392,6 @@ def _is_system_in_template():
                 bos_token="<bos>",
             )
             return True
-
         # This will be raised if Jinja attempts to render the system message and it can't
         except Exception:
             return False
@@ -468,7 +425,7 @@ def _is_system_in_template():
                 )
             except Exception as e:
                 if "Conversation roles must alternate user/assistant" in str(e):
-                    # reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, add a blank 'user' or 'assistant' message to ensure compatibility
+                    # reformat messages to ensure user/assistant are alternating
                     new_messages = []
                     for i in range(len(reformatted_messages) - 1):
                         new_messages.append(reformatted_messages[i])
@@ -494,6 +451,188 @@ def _is_system_in_template():
         )  # don't use verbose_logger.exception, if exception is raised


+async def _afetch_and_extract_template(
+    model: str, chat_template: Optional[Any], get_config_fn, get_template_fn
+) -> Tuple[str, str, str]:
+    """
+    Async version: Fetch template and tokens from HuggingFace.
+
+    Returns: (chat_template, bos_token, eos_token)
+    """
+    from litellm.litellm_core_utils.prompt_templates.huggingface_template_handler import (
+        _extract_token_value,
+    )
+
+    bos_token = ""
+    eos_token = ""
+
+    if chat_template is None:
+        # Fetch or retrieve cached tokenizer config
+        if model in litellm.known_tokenizer_config:
+            tokenizer_config = litellm.known_tokenizer_config[model]
+        else:
+            tokenizer_config = await get_config_fn(hf_model_name=model)
+            litellm.known_tokenizer_config.update({model: tokenizer_config})
+
+        # Try to get chat template from tokenizer_config.json first
+        if (
+            tokenizer_config.get("status") == "success"
+            and "tokenizer" in tokenizer_config
+            and isinstance(tokenizer_config["tokenizer"], dict)
+            and "chat_template" in tokenizer_config["tokenizer"]
+        ):
+            tokenizer_data: dict = tokenizer_config["tokenizer"]  # type: ignore
+            bos_token = _extract_token_value(
+                token_value=tokenizer_data.get("bos_token")
+            )
+            eos_token = _extract_token_value(
+                token_value=tokenizer_data.get("eos_token")
+            )
+            chat_template = tokenizer_data["chat_template"]
+        else:
+            # Fallback: Try to fetch chat template from separate .jinja file
+            template_result = await get_template_fn(hf_model_name=model)
+            if template_result.get("status") == "success":
+                chat_template = template_result["chat_template"]
+                # Still try to get tokens from tokenizer_config if available
+                if (
+                    tokenizer_config.get("status") == "success"
+                    and "tokenizer" in tokenizer_config
+                    and isinstance(tokenizer_config["tokenizer"], dict)
+                ):
+                    tokenizer_data: dict = tokenizer_config["tokenizer"]  # type: ignore
+                    bos_token = _extract_token_value(
+                        token_value=tokenizer_data.get("bos_token")
+                    )
+                    eos_token = _extract_token_value(
+                        token_value=tokenizer_data.get("eos_token")
+                    )
+            else:
+                raise Exception("No chat template found")
+
+    return chat_template, bos_token, eos_token  # type: ignore
+
+
+def _fetch_and_extract_template(
+    model: str, chat_template: Optional[Any], get_config_fn, get_template_fn
+) -> Tuple[str, str, str]:
+    """
+    Sync version: Fetch template and tokens from HuggingFace.
+
+    Returns: (chat_template, bos_token, eos_token)
+    """
+    from litellm.litellm_core_utils.prompt_templates.huggingface_template_handler import (
+        _extract_token_value,
+    )
+
+    bos_token = ""
+    eos_token = ""
+
+    if chat_template is None:
+        # Fetch or retrieve cached tokenizer config
+        if model in litellm.known_tokenizer_config:
+            tokenizer_config = litellm.known_tokenizer_config[model]
+        else:
+            tokenizer_config = get_config_fn(hf_model_name=model)
+            litellm.known_tokenizer_config.update({model: tokenizer_config})
+
+        # Try to get chat template from tokenizer_config.json first
+        if (
+            tokenizer_config.get("status") == "success"
+            and "tokenizer" in tokenizer_config
+            and isinstance(tokenizer_config["tokenizer"], dict)
+            and "chat_template" in tokenizer_config["tokenizer"]
+        ):
+            tokenizer_data: dict = tokenizer_config["tokenizer"]  # type: ignore
+            bos_token = _extract_token_value(
+                token_value=tokenizer_data.get("bos_token")
+            )
+            eos_token = _extract_token_value(
+                token_value=tokenizer_data.get("eos_token")
+            )
+            chat_template = tokenizer_data["chat_template"]
+        else:
+            # Fallback: Try to fetch chat template from separate .jinja file
+            template_result = get_template_fn(hf_model_name=model)
+            if template_result.get("status") == "success":
+                chat_template = template_result["chat_template"]
+                # Still try to get tokens from tokenizer_config if available
+                if (
+                    tokenizer_config.get("status") == "success"
+                    and "tokenizer" in tokenizer_config
+                    and isinstance(tokenizer_config["tokenizer"], dict)
+                ):
+                    tokenizer_data: dict = tokenizer_config["tokenizer"]  # type: ignore
+                    bos_token = _extract_token_value(
+                        token_value=tokenizer_data.get("bos_token")
+                    )
+                    eos_token = _extract_token_value(
+                        token_value=tokenizer_data.get("eos_token")
+                    )
+            else:
+                raise Exception("No chat template found")
+
+    return chat_template, bos_token, eos_token  # type: ignore
+
+
+async def ahf_chat_template(
+    model: str, messages: list, chat_template: Optional[Any] = None
+):
+    """HuggingFace chat template (async version)"""
+    from litellm.litellm_core_utils.prompt_templates.huggingface_template_handler import (
+        _aget_chat_template_file,
+        _aget_tokenizer_config,
+        strftime_now,
+    )
+
+    env = ImmutableSandboxedEnvironment()
+    env.globals["raise_exception"] = lambda msg: Exception(f"Error message - {msg}")
+    env.globals["strftime_now"] = strftime_now
+
+    template, bos_token, eos_token = await _afetch_and_extract_template(
+        model=model,
+        chat_template=chat_template,
+        get_config_fn=_aget_tokenizer_config,
+        get_template_fn=_aget_chat_template_file,
+    )
+    return _render_chat_template(
+        env=env,
+        chat_template=template,
+        bos_token=bos_token,
+        eos_token=eos_token,
+        messages=messages,
+    )
+
+
+def hf_chat_template(
+    model: str, messages: list, chat_template: Optional[Any] = None
+):
+    """HuggingFace chat template (sync version)"""
+    from litellm.litellm_core_utils.prompt_templates.huggingface_template_handler import (
+        _get_chat_template_file,
+        _get_tokenizer_config,
+        strftime_now,
+    )
+
+    env = ImmutableSandboxedEnvironment()
+    env.globals["raise_exception"] = lambda msg: Exception(f"Error message - {msg}")
+    env.globals["strftime_now"] = strftime_now
+
+    template, bos_token, eos_token = _fetch_and_extract_template(
+        model=model,
+        chat_template=chat_template,
+        get_config_fn=_get_tokenizer_config,
+        get_template_fn=_get_chat_template_file,
+    )
+    return _render_chat_template(
+        env=env,
+        chat_template=template,
+        bos_token=bos_token,
+        eos_token=eos_token,
+        messages=messages,
+    )
+
+
 def deepseek_r1_pt(messages):
     return hf_chat_template(
         model="deepseek-r1/deepseek-r1-7b-instruct", messages=messages
@@ -4031,33 +4170,9 @@ def prompt_factory(
     elif custom_llm_provider == "azure_text":
         return azure_text_pt(messages=messages)
     elif custom_llm_provider == "watsonx":
-        if "granite" in model and "chat" in model:
-            # granite-13b-chat-v1 and granite-13b-chat-v2 use a specific prompt template
-            return ibm_granite_pt(messages=messages)
-        elif "ibm-mistral" in model and "instruct" in model:
-            # models like ibm-mistral/mixtral-8x7b-instruct-v01-q use the mistral instruct prompt template
-            return mistral_instruct_pt(messages=messages)
-        elif "meta-llama/llama-3" in model and "instruct" in model:
-            # https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
-            return custom_prompt(
-                role_dict={
-                    "system": {
-                        "pre_message": "<|start_header_id|>system<|end_header_id|>\n",
-                        "post_message": "<|eot_id|>",
-                    },
-                    "user": {
-                        "pre_message": "<|start_header_id|>user<|end_header_id|>\n",
-                        "post_message": "<|eot_id|>",
-                    },
-                    "assistant": {
-                        "pre_message": "<|start_header_id|>assistant<|end_header_id|>\n",
-                        "post_message": "<|eot_id|>",
-                    },
-                },
-                messages=messages,
-                initial_prompt_value="<|begin_of_text|>",
-                final_prompt_value="<|start_header_id|>assistant<|end_header_id|>\n",
-            )
+        from litellm.llms.watsonx.chat.transformation import IBMWatsonXChatConfig
+        return IBMWatsonXChatConfig.apply_prompt_template(model=model, messages=messages)
+
     try:
         if "meta-llama/llama-2" in model and "chat" in model:
             return llama_2_chat_pt(messages=messages)
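
For reference, a minimal sketch of the refactored sync entry point (it assumes huggingface.co is reachable so tokenizer_config.json, or the .jinja fallback, can be fetched; ahf_chat_template is the async counterpart with the same call shape):

from litellm.litellm_core_utils.prompt_templates.factory import hf_chat_template

# Fetches the chat template (tokenizer_config.json first, separate .jinja file
# as a fallback), extracts bos/eos tokens, then renders via _render_chat_template.
prompt = hf_chat_template(
    model="deepseek-r1/deepseek-r1-7b-instruct",  # same model id used by deepseek_r1_pt above
    messages=[{"role": "user", "content": "Say hi."}],
)
print(prompt)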
