@@ -336,14 +336,12 @@ def __init__(
336336 if tokenizer is not None :
337337 self ._tokenizer = tokenizer
338338 else :
339- try :
340- self ._tokenizer = AutoTokenizer .from_pretrained (
341- model_path ,
342- config = config ,
343- use_fast = self .use_fast ,
344- trust_remote_code = trust_remote_code )
345- except ValueError :
346- self ._tokenizer = MistralTokenizer .from_pretrained (model_path )
339+ self ._tokenizer = AutoTokenizer .from_pretrained (
340+ model_path ,
341+ config = config ,
342+ use_fast = self .use_fast ,
343+ trust_remote_code = trust_remote_code )
344+
347345 self ._model_path = model_path
348346 if isinstance (self ._tokenizer , MistralTokenizer ):
349347 self ._processor = MistralCommonImageProcessor (
@@ -353,6 +351,9 @@ def __init__(
353351 model_path ,
354352 use_fast = self .use_fast ,
355353 trust_remote_code = trust_remote_code )
354+
355+ logger .debug (f"Mistral3InputProcessor: using { type (self ._processor )} preprocessor" )
356+ logger .debug (f"Mistral3InputProcessor: using { type (self ._tokenizer )} tokenizer" )
356357
357358 @property
358359 def config (self ) -> PretrainedConfig :
@@ -443,6 +444,37 @@ def get_mm_special_token_ids(self) -> torch.Tensor:
443444 self .processor .image_end_token_id ,
444445 ])
445446
class MistralCommonInputProcessor(Mistral3InputProcessor):
    """Input processor that prefers the mistral-common tokenizer.

    Attempts to load a ``MistralTokenizer`` from ``model_path`` and falls
    back to HuggingFace's ``AutoTokenizer`` when that fails.
    """

    def __init__(
        self,
        model_path: str,
        config: PretrainedConfig,
        tokenizer: Optional[AutoTokenizer],
        trust_remote_code: bool = False,
        **kwargs,
    ):
        """Build the processor, loading a tokenizer only when none is given.

        Fixes vs. the original: a caller-supplied ``tokenizer`` is now
        honored instead of being unconditionally overwritten, and the
        caller's ``trust_remote_code`` flag is forwarded to the fallback
        ``AutoTokenizer`` load instead of being silently hardcoded to True.
        """
        if tokenizer is None:
            tokenizer = self.load_tokenizer(
                model_path,
                config=config,
                trust_remote_code=trust_remote_code)
        super().__init__(model_path=model_path,
                         config=config,
                         tokenizer=tokenizer,
                         **kwargs)

    @staticmethod
    def load_tokenizer(model_path: str,
                       config: PretrainedConfig,
                       checkpoint_format: Optional[str] = "mistral_large_3",
                       trust_remote_code: bool = True):
        """Load a tokenizer for ``model_path``.

        For the ``mistral_large_3`` checkpoint format, the mistral-common
        tokenizer is tried first; on ``ValueError`` (not a mistral-common
        checkpoint) we fall back to ``AutoTokenizer``.

        ``trust_remote_code`` defaults to True to preserve the previous
        hardcoded behavior for existing direct callers; ``__init__`` now
        passes its own flag through explicitly.
        """
        if checkpoint_format == "mistral_large_3":
            try:
                return MistralTokenizer.from_pretrained(model_path)
            except ValueError:
                # Lazy %-args: message is only formatted if the record is emitted.
                logger.info(
                    "Could not load mistral-common tokenizer from %s, "
                    "falling back to HuggingFace", model_path)

        return AutoTokenizer.from_pretrained(
            model_path,
            config=config,
            use_fast=True,
            trust_remote_code=trust_remote_code)
446478
447479class Mistral3Gate (nn .Module ):
448480
@@ -478,26 +510,27 @@ def load_weights(self, weights: List[Dict]):
478510@register_auto_model ("Mistral3ForConditionalGeneration" )
479511@register_auto_model ("PixtralForConditionalGeneration" )
480512@register_input_processor (
481- Mistral3InputProcessor ,
482- model_type = "mistral3_hf " ,
513+ MistralCommonInputProcessor ,
514+ model_type = "mistral3 " ,
483515 placeholder_metadata = MultimodalPlaceholderMetadata (
484516 placeholder_map = {
517+ # NOTE: mistral-common uses the tokenizer to set placeholders, this will be ignored
485518 "image" : "[IMG]" ,
486519 },
487- # NOTE: for mistral3 multimodal models, it does not strictly have to be before the text.
488- # Ref: https://github.com/mistralai/mistral-common/blob/039465db2bdc0486df36365c9bdb428188482a18/
489- # src/mistral_common/tokens/tokenizers/base.py#L326
490- # However, accuracy tests show that the model generates higher quality output when the image
491- # precedes the text (the relative difference can be as much as ~30% for both vLLM and TRT-LLM).
492520 placeholder_placement = MultimodalPlaceholderPlacement .BEFORE_TEXT ,
493521 ))
494522@register_input_processor (
495523 Mistral3InputProcessor ,
496- model_type = "mistral3 " ,
524+ model_type = "mistral3_hf " ,
497525 placeholder_metadata = MultimodalPlaceholderMetadata (
498526 placeholder_map = {
499527 "image" : "[IMG]" ,
500528 },
529+ # NOTE: for mistral3 multimodal models, it does not strictly have to be before the text.
530+ # Ref: https://github.com/mistralai/mistral-common/blob/039465db2bdc0486df36365c9bdb428188482a18/
531+ # src/mistral_common/tokens/tokenizers/base.py#L326
532+ # However, accuracy tests show that the model generates higher quality output when the image
533+ # precedes the text (the relative difference can be as much as ~30% for both vLLM and TRT-LLM).
501534 placeholder_placement = MultimodalPlaceholderPlacement .BEFORE_TEXT ,
502535 ))
503536class Mistral3VLM (PreTrainedModel ):
0 commit comments