Skip to content

Commit cc4c2e5

Browse files
committed
First commit
1 parent 378c700 commit cc4c2e5

File tree

3 files changed

+59
-12
lines changed

3 files changed

+59
-12
lines changed

examples/models/llava/export_llava.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,18 +77,20 @@ def __init__(self, llava):
7777
super().__init__()
7878
self.text_model = llava.text_model
7979

80-
def forward(self, input_pos, embeddings):
81-
return self.text_model(None, {"input_pos": input_pos}, embeddings)
80+
def forward(self, cache_positions, embeddings):
81+
return self.text_model(None, {"input_pos": cache_positions[:1]}, embeddings)
8282

8383
llava_text_model = LlavaTextModel(llava)
84-
8584
text_model_em = LLMEdgeManager(
8685
model=llava_text_model,
8786
modelname="llava_text_model",
8887
max_seq_len=llava.text_model_args.max_seq_len,
8988
dtype=DType.fp32,
9089
use_kv_cache=True,
91-
example_inputs=(torch.tensor([0], dtype=torch.int64), embeddings),
90+
example_inputs=(
91+
torch.tensor(list(range(embeddings.shape[1])), dtype=torch.int64),
92+
embeddings,
93+
),
9294
dynamic_shapes=dynamic_shapes,
9395
)
9496

examples/models/llava/main.cpp

Lines changed: 52 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9-
#include <executorch/examples/models/llava/runner/llava_runner.h>
9+
#include <executorch/extension/llm/runner/image.h>
10+
#include <executorch/extension/llm/runner/multimodal_input.h>
11+
#include <executorch/extension/llm/runner/multimodal_runner.h>
1012
#include <gflags/gflags.h>
13+
#include <pytorch/tokenizers/llama2c_tokenizer.h>
1114
#define STB_IMAGE_IMPLEMENTATION
1215
#include <stb_image.h>
1316
#define STB_IMAGE_RESIZE_IMPLEMENTATION
@@ -44,7 +47,10 @@ DEFINE_int32(
4447
-1,
4548
"Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
4649

47-
using executorch::extension::llm::Image;
50+
using ::executorch::extension::llm::Image;
51+
using ::executorch::extension::llm::make_image_input;
52+
using ::executorch::extension::llm::make_text_input;
53+
using ::executorch::extension::llm::MultimodalInput;
4854

4955
void load_image(const std::string& image_path, Image& image) {
5056
int width, height, channels;
@@ -127,14 +133,53 @@ int32_t main(int32_t argc, char** argv) {
127133
->_unsafe_reset_threadpool(num_performant_cores);
128134
}
129135
#endif
130-
// create llama runner
131-
example::LlavaRunner runner(model_path, tokenizer_path, temperature);
136+
// Load tokenizer
137+
std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
138+
std::make_unique<tokenizers::Llama2cTokenizer>();
139+
tokenizer->load(tokenizer_path);
140+
if (tokenizer == nullptr) {
141+
ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
142+
return 1;
143+
}
144+
145+
// Create multimodal runner
146+
std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
147+
::executorch::extension::llm::create_multimodal_runner(
148+
model_path, std::move(tokenizer));
149+
if (runner == nullptr) {
150+
ET_LOG(Error, "Failed to create multimodal runner");
151+
return 1;
152+
}
132153

154+
// Load runner
155+
auto load_error = runner->load();
156+
if (load_error != ::executorch::runtime::Error::Ok) {
157+
ET_LOG(Error, "Failed to load multimodal runner");
158+
return 1;
159+
}
160+
161+
// Prepare inputs
162+
static const char* kPresetPrompt =
163+
"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: ";
133164
Image image;
134165
load_image(image_path, image);
135-
std::vector<Image> images = {image};
166+
std::vector<MultimodalInput> inputs = {
167+
make_text_input(std::string(kPresetPrompt)),
168+
make_image_input(image),
169+
make_text_input(std::string(prompt)),
170+
};
171+
172+
::executorch::extension::llm::GenerationConfig config;
173+
config.temperature = temperature;
174+
175+
// Generate
176+
ET_LOG(Info, "Starting generation...");
177+
auto error = runner->generate(inputs, config);
178+
if (error != ::executorch::runtime::Error::Ok) {
179+
ET_LOG(Error, "Failed to generate with multimodal runner");
180+
return 1;
181+
}
136182

137-
// generate
138-
runner.generate(std::move(images), prompt, seq_len);
183+
printf("\n");
139184
return 0;
140185
}

examples/models/llava/model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -405,5 +405,5 @@ def _get_image_dynamic_shapes(self):
405405

406406
def _get_prompt_dynamic_shapes(self):
407407
dim = torch.export.Dim("token_dim", min=2, max=self.max_seq_len)
408-
text_model_dynamic_shapes = ({0: 1}, {1: dim})
408+
text_model_dynamic_shapes = ({0: dim}, {1: dim})
409409
return text_model_dynamic_shapes

0 commit comments

Comments (0)