From cde2484439f4f4866d3a0034be6cf06150344a7f Mon Sep 17 00:00:00 2001
From: JunmooB
Date: Thu, 31 Jul 2025 13:20:33 +0900
Subject: [PATCH] Add tokenizer override logic for mapped HF model names

---
 python/openai/openai_frontend/engine/triton_engine.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/openai/openai_frontend/engine/triton_engine.py b/python/openai/openai_frontend/engine/triton_engine.py
index 499cc623e7..8587cbc30a 100644
--- a/python/openai/openai_frontend/engine/triton_engine.py
+++ b/python/openai/openai_frontend/engine/triton_engine.py
@@ -395,10 +395,16 @@ def _get_tokenizer(self, tokenizer_name: str):
     def _get_model_metadata(self) -> Dict[str, TritonModelMetadata]:
         # One tokenizer and creation time shared for all loaded models for now.
         model_metadata = {}
+
+        # Mapping of custom model identifiers to their corresponding Hugging Face model names
+        HF_MODEL_NAME_MAP = {
+            "llama-3.1-8b-instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "mistral-nemo-instruct-2407": "mistralai/Mistral-Nemo-Instruct-2407",
+        }
 
         # Read all triton models and store the necessary metadata for each
         for name, _ in self.server.models().keys():
             model = self.server.model(name)
             backend = model.config()["backend"]
             # Explicitly handle ensembles to avoid any runtime validation errors
             if not backend and model.config()["platform"] == "ensemble":
@@ -410,12 +416,18 @@ def _get_model_metadata(self) -> Dict[str, TritonModelMetadata]:
             lora_names = _get_vllm_lora_names(
                 self.server.options.model_repository, name, model.version
             )
+            # Map to Hugging Face model name if available
+            hf_model_name = HF_MODEL_NAME_MAP.get(name, name)
+            # Try to get tokenizer for the mapped model name
+            tokenizer_override = get_tokenizer(hf_model_name)
+            # Use the override tokenizer if available; otherwise fall back to default
+            tokenizer = tokenizer_override if tokenizer_override else self.tokenizer
             metadata = TritonModelMetadata(
                 name=name,
                 backend=backend,
                 model=model,
-                tokenizer=self.tokenizer,
+                tokenizer=tokenizer,
                 lora_names=lora_names,
                 create_time=self.create_time,
                 request_converter=self._determine_request_converter(backend),