
Commit ad2a6ed (parent 79d5fda)

Fix runtime bugs with multi-modal models (microsoft#1701)

### Description

This PR fixes runtime bugs with Phi-4 multi-modal and Gemma-3 vision. It also fixes [this issue](microsoft#1698).

### Motivation and Context

Bug 1: The `num_audio_tokens` value contains the correct number of audio tokens, but it was not being assigned to `num_audio_tokens_`, which is used to initialize `audio_features_`.

https://github.com/microsoft/onnxruntime-genai/blob/b3ddb21fd5a583ca1a45bf416e8f70ff7df2f4ba/src/models/multi_modal.cpp#L121-L124

Bug 2: The image processor for Gemma-3 vision returns `num_img_tokens` as `int32_t`.

https://github.com/microsoft/onnxruntime-genai/blob/b3ddb21fd5a583ca1a45bf416e8f70ff7df2f4ba/src/models/gemma_image_processor.cpp#L73-L74

However, the subsequent code that uses `num_img_tokens` interprets it as `int64_t`.

https://github.com/microsoft/onnxruntime-genai/blob/b3ddb21fd5a583ca1a45bf416e8f70ff7df2f4ba/src/models/multi_modal.cpp#L15-L18

Bug 3: The `tokenizer.apply_chat_template` API does not appear to integrate well with Phi-4 multi-modal. The result from applying the chat template differs from the result of manually constructing the string to tokenize.

When manually constructing the string to tokenize (input is 1 image + 1 prompt):

```
<|user|>
<|image_1|>
describe this<|end|>
<|assistant|>
```

When applying the chat template (input is 1 image + 1 prompt):

```
<|user|>[{'type': 'image'}, {'type': 'text', 'text': 'describe this'}]<|end|><|assistant|>
```

Because the `<|image_1|>` token is missing, a runtime error is raised stating that the number of image tokens does not match the number of images. For now, the chat template changes to `phi4-mm.py` have been reverted to avoid this error.

### Errors

The following errors occurred as a result of these bugs.

1. Out-of-memory allocations

CPU:

```python
Traceback (most recent call last):
  File "/home/username/onnxruntime-genai/examples/python/model-vision.py", line 155, in <module>
    run(args)
  File "/home/username/onnxruntime-genai/examples/python/model-vision.py", line 114, in run
    generator.set_inputs(inputs)
RuntimeError: std::bad_alloc
```

CUDA:

```python
RuntimeError: /onnxruntime_src/onnxruntime/core/framework/bfc_arena.cc:376 void* onnxruntime::BFCArena::AllocateRawInternal(size_t, bool, onnxruntime::Stream*, bool, onnxruntime::WaitNotificationFn) Failed to allocate memory for requested buffer of size 6755300567900160
```

2. C++ internal errors

```cpp
/opt/rh/gcc-toolset-12/root/usr/include/c++/12/string_view:239: constexpr const std::basic_string_view<_CharT, _Traits>::value_type& std::basic_string_view<_CharT, _Traits>::operator[](size_type) const [with _CharT = char32_t; _Traits = std::char_traits<char32_t>; const_reference = const char32_t&; size_type = long unsigned int]: Assertion '__pos < this->_M_len' failed.
```

3. Integer overflow errors

```python
Traceback (most recent call last):
  File "C:\Users\username\Downloads\gemma-3-vision-it\run_vision.py", line 179, in <module>
    run(args)
  File "C:\Users\username\Downloads\gemma-3-vision-it\run_vision.py", line 137, in run
    generator.set_inputs(inputs)
RuntimeError: D:\a\_work\1\s\onnxruntime\core/common/safeint.h:17 SafeIntExceptionHandler<class onnxruntime::OnnxRuntimeException>::SafeIntOnOverflow Integer overflow
```

4. Pre-processing errors

```python
Traceback (most recent call last):
  File "/home/username/onnxruntime-genai/examples/python/phi4-mm.py", line 181, in <module>
    run(args)
  File "/home/username/onnxruntime-genai/examples/python/phi4-mm.py", line 129, in run
    inputs = processor(prompt, images=images, audios=audios)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Number of image tokens does not match the number of images. Please fix the prompt.
```
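To make the Bug 2 failure mode concrete, here is a minimal sketch, not code from this repository: numpy stands in for the raw tensor buffer, and the snippet only models what happens when a count written as a 32-bit integer is read back as a 64-bit integer, so the upper four bytes come from adjacent, unrelated memory.

```python
import numpy as np

# Illustrative sketch of the Bug 2 mechanism (not repository code).
# Pretend tensor storage: the real count (256) sits next to unrelated bytes.
storage = np.array([256, 0x1234], dtype=np.int32)

# Producer writes the count as int32; consumer reads the same bytes as int64.
as_int64 = storage.view(np.int64)[0]

print(as_int64)  # roughly 2.0e13 on a little-endian machine, instead of 256
# A bogus count of this size then drives tensor allocation, which surfaces as
# std::bad_alloc, the BFCArena failure, or the SafeInt overflow shown above.
```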

File tree: 3 files changed (+21 / -25 lines)

examples/python/phi4-mm.py
Lines changed: 17 additions & 23 deletions

@@ -5,7 +5,6 @@
 import os
 import glob
 import time
-import json
 from pathlib import Path

 import onnxruntime_genai as og
@@ -64,9 +63,8 @@ def run(args: argparse.Namespace):
     model = og.Model(config)
     print("Model loaded")

-    tokenizer = og.Tokenizer(model)
     processor = model.create_multimodal_processor()
-    stream = processor.create_stream()
+    tokenizer_stream = processor.create_stream()

     interactive = not args.non_interactive

@@ -86,44 +84,40 @@ def run(args: argparse.Namespace):

         images = None
         audios = None
+        prompt = "<|user|>\n"

-        # Validate and open image paths
+        # Get images
         if len(image_paths) == 0:
             print("No image provided")
         else:
-            for image_path in image_paths:
+            for i, image_path in enumerate(image_paths):
                 if not os.path.exists(image_path):
                     raise FileNotFoundError(f"Image file not found: {image_path}")
                 print(f"Using image: {image_path}")
+                prompt += f"<|image_{i+1}|>\n"
             images = og.Images.open(*image_paths)

-        # Validate and open audio paths
+        # Get audios
         if len(audio_paths) == 0:
             print("No audio provided")
         else:
-            for audio_path in audio_paths:
+            for i, audio_path in enumerate(audio_paths):
                 if not os.path.exists(audio_path):
                     raise FileNotFoundError(f"Audio file not found: {audio_path}")
                 print(f"Using audio: {audio_path}")
+                prompt += f"<|audio_{i+1}|>\n"
             audios = og.Audios.open(*audio_paths)

-        # Get prompt text
+
         if interactive:
             text = input("Prompt: ")
         else:
-            text = args.prompt or "Does the audio summarize what is shown in the image? If not, what is different?"
-
-        # Build multimodal content list
-        content_list = []
-        content_list.extend([{"type": "image"} for _ in image_paths])
-        content_list.extend([{"type": "audio"} for _ in audio_paths])
-        content_list.append({"type": "text", "text": text})
-
-        # Construct messages and apply template
-        messages = [{"role": "user", "content": content_list}]
-        message_json = json.dumps(messages)
-        prompt = tokenizer.apply_chat_template(message_json, add_generation_prompt=True)
-
+            if args.prompt:
+                text = args.prompt
+            else:
+                text = "Does the audio summarize what is shown in the image? If not, what is different?"
+        prompt += f"{text}<|end|>\n<|assistant|>\n"
+
         print("Processing inputs...")
         inputs = processor(prompt, images=images, audios=audios)
         print("Processor complete.")
@@ -140,7 +134,7 @@ def run(args: argparse.Namespace):
             generator.generate_next_token()

             new_token = generator.get_next_tokens()[0]
-            print(stream.decode(new_token), end="", flush=True)
+            print(tokenizer_stream.decode(new_token), end="", flush=True)

         print()
         total_run_time = time.time() - start_time
@@ -177,4 +171,4 @@ def run(args: argparse.Namespace):
         '--non-interactive', action=argparse.BooleanOptionalAction, required=False, help='Non-interactive mode, mainly for CI usage'
     )
     args = parser.parse_args()
-    run(args)
+    run(args)
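For reference, here is a minimal standalone sketch of the prompt string the reverted example logic assembles for one image plus one text prompt; the inputs below are hypothetical placeholders, and the printed output matches the working format quoted in the description.

```python
# Sketch of the manual prompt construction restored above (hypothetical inputs).
image_paths = ["example.png"]   # placeholder image path
text = "describe this"          # placeholder user prompt

prompt = "<|user|>\n"
for i, _ in enumerate(image_paths):
    prompt += f"<|image_{i+1}|>\n"   # one <|image_N|> tag per image
prompt += f"{text}<|end|>\n<|assistant|>\n"

print(prompt, end="")
# <|user|>
# <|image_1|>
# describe this<|end|>
# <|assistant|>
```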

src/models/gemma_image_processor.cpp
Lines changed: 2 additions & 2 deletions

@@ -70,8 +70,8 @@ ProcessImagePrompt(const Generators::Tokenizer& tokenizer, const std::string& pr
     }
   }

-  std::unique_ptr<OrtValue> num_img_tokens = OrtValue::CreateTensor<int32_t>(allocator, std::vector<int64_t>{1});
-  num_img_tokens->GetTensorMutableData<int32_t>()[0] = static_cast<int32_t>(image_seq_length);
+  std::unique_ptr<OrtValue> num_img_tokens = OrtValue::CreateTensor<int64_t>(allocator, std::vector<int64_t>{1});
+  num_img_tokens->GetTensorMutableData<int64_t>()[0] = static_cast<int64_t>(image_seq_length);

   return {std::move(input_ids_value), std::move(token_type_ids), std::move(num_img_tokens)};
 }

src/models/multi_modal.cpp
Lines changed: 2 additions & 0 deletions

@@ -119,6 +119,8 @@ SpeechState::SpeechState(const MultiModalLanguageModel& model, const GeneratorPa
       model_{model} {}

 void SpeechState::SetExtraInputs(const std::vector<ExtraInput>& extra_inputs, const int64_t num_audio_tokens) {
+  num_audio_tokens_ = num_audio_tokens;
+
   audio_features_ = std::make_unique<MultiModalFeatures>(*this, MultiModalFeatures::Mode::Output, // Model output
                                                          model_.config_->model.speech.outputs.audio_features,
                                                          -1, num_audio_tokens_);
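As a plain illustration of the pattern behind Bug 1 (all names below are hypothetical, not the C++ types above): the setter previously accepted the token count but never stored it, so the member later used to size the audio features kept a stale value; the two added lines store it before `audio_features_` is constructed.

```python
# Python analogue of the Bug 1 pattern (hypothetical names, illustrative only).
class SpeechStateSketch:
    def __init__(self):
        self.num_audio_tokens = 0  # stale default used to size audio features

    def set_extra_inputs_broken(self, num_audio_tokens):
        pass  # bug: the parameter is accepted but never stored

    def set_extra_inputs_fixed(self, num_audio_tokens):
        self.num_audio_tokens = num_audio_tokens  # mirrors the one-line fix above

state = SpeechStateSketch()
state.set_extra_inputs_broken(128)
print(state.num_audio_tokens)  # 0   -> features sized from the wrong count
state.set_extra_inputs_fixed(128)
print(state.num_audio_tokens)  # 128 -> features sized correctly
```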
