
Commit e18ce85

Fix stuff and order
1 parent d35f5e9 commit e18ce85

4 files changed (+38 −40 lines)

extension/llm/runner/multimodal_decoder_runner.h (1 addition, 1 deletion)

@@ -48,7 +48,7 @@ class ET_EXPERIMENTAL MultimodalDecoderRunner
         &start_pos, {1}, executorch::aten::ScalarType::Long);
     // run text model
     auto outputs_res = ET_UNWRAP(
-        module_->execute(kTextModelMethod, {start_pos_tensor, embeddings}));
+        module_->execute(kTextModelMethod, {embeddings, start_pos_tensor}));
 
     ET_CHECK_MSG(
         outputs_res.size() == 1,
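
The change above swaps the input order for the exported text model: token embeddings now come first and the cache position tensor second. Below is a minimal, hypothetical call-site sketch under the new convention; the .pte path, tensor shapes, and buffer contents are placeholders, and only the execute() pattern itself is taken from the commit.

// Hypothetical call site (not part of the commit) showing the new input
// order: token embeddings first, cache position tensor second.
#include <executorch/extension/llm/runner/constants.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

#include <cstdint>
#include <vector>

using executorch::extension::Module;
using executorch::extension::from_blob;

int main() {
  Module module("multimodal_model.pte"); // placeholder model file

  // Toy embedding buffer shaped [1, seq_len, hidden_dim] = [1, 1, 4].
  std::vector<float> embedding_data(4, 0.0f);
  auto embeddings = from_blob(
      embedding_data.data(), {1, 1, 4}, executorch::aten::ScalarType::Float);

  // Scalar cache position, as in the numel == 1 branch of the runner.
  int64_t start_pos = 0;
  auto start_pos_tensor =
      from_blob(&start_pos, {1}, executorch::aten::ScalarType::Long);

  // New convention after this commit: {embeddings, start_pos_tensor}.
  auto outputs = module.execute(
      executorch::extension::llm::kTextModelMethod,
      {embeddings, start_pos_tensor});
  return outputs.error() == executorch::runtime::Error::Ok ? 0 : 1;
}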

extension/llm/runner/multimodal_prefiller.cpp (4 additions, 25 deletions)

@@ -92,39 +92,18 @@ Result<uint64_t> MultimodalPrefiller::prefill(
 
   // 2. Run decoder model for prefill.
 
-  // Get expected shape of cache position tensor, which should be the first (0th
-  // index) argument
-  auto method_meta = ET_UNWRAP(module_->method_meta(kTextModelMethod));
-  auto first_input_info = ET_UNWRAP(method_meta.input_tensor_meta(0));
-  auto first_input_sizes = first_input_info.sizes();
-  auto numel = first_input_sizes[0];
+
+  // Get expected shape of cache position tensor, which should be the second argument
 
   int64_t seq_len = encoder_output.toTensor().size(1);
   if (seq_len == 0) {
     ET_LOG(Error, "The encoder returned an empty output.");
     return ::executorch::runtime::Error::InvalidState;
   }
+  auto cache_position_tensor = ET_UNWRAP(populate_start_pos_tensor(module_, start_pos, seq_len));
 
-  executorch::extension::TensorPtr cache_position_tensor;
-  if (numel > 1) {
-    // `cache_position` goes from start_pos to start_pos +
-    // encoder_output.size(1). e.g. if start_pos = 2 and encoder_output.size(1)
-    // = 5, cache_position_tensor should be [2, 3, 4, 5, 6].
-    std::vector<int64_t> cache_positions(seq_len);
-    for (int64_t i = 0; i < seq_len; ++i) {
-      cache_positions[i] = start_pos + i;
-    }
-    cache_position_tensor = ::executorch::extension::from_blob(
-        cache_positions.data(),
-        {static_cast<int>(seq_len)},
-        executorch::aten::ScalarType::Long);
-  } else {
-    // Cache position is size 1.
-    cache_position_tensor = ::executorch::extension::from_blob(
-        &start_pos, {1}, executorch::aten::ScalarType::Long);
-  }
   auto prefill_result = module_->execute(
-      kTextModelMethod, {cache_position_tensor, encoder_output});
+      kTextModelMethod, {encoder_output, cache_position_tensor});
   if (prefill_result.error() != ::executorch::runtime::Error::Ok) {
     return prefill_result.error();
   }
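
The comment carried over into the new helper spells out the prefill indexing: positions run from start_pos through start_pos + seq_len − 1. A standalone arithmetic check of that exact example (start_pos = 2, encoder sequence length 5) is below; it is plain C++ with no ExecuTorch dependency.

// Standalone sketch of the cache-position arithmetic the prefill path
// relies on, using the values from the comment in the diff.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  int64_t start_pos = 2;
  int64_t seq_len = 5; // encoder_output.toTensor().size(1)

  std::vector<int64_t> cache_positions(seq_len);
  for (int64_t i = 0; i < seq_len; ++i) {
    cache_positions[i] = start_pos + i;
  }

  for (int64_t p : cache_positions) {
    printf("%lld ", static_cast<long long>(p)); // prints: 2 3 4 5 6
  }
  printf("\n");
  return 0;
}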

extension/llm/runner/text_decoder_runner.cpp (1 addition, 14 deletions)

@@ -53,20 +53,7 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
   auto numel = sizes[0];
   std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
 
-  TensorPtr start_pos_tensor;
-  if (numel > 1) {
-    // If we are here, model is exported with cache_positions, create a tensor
-    // with the same length as input_ids. Assuming the last dimension is the
-    // one with the variable token length, for example [1, S] or [1, 1, S]
-    sizes_vec[sizes_vec.size() - 1] = tokens->numel();
-    start_pos_tensor = empty(sizes_vec, ::executorch::aten::ScalarType::Long);
-    torch::executor::native::arange_out_impl(
-        start_pos, start_pos + tokens->numel(), 1.0, *start_pos_tensor);
-  } else {
-    // Assuming model is exported with input_pos, create a tensor with size 1
-    start_pos_tensor = from_blob(
-        &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
-  }
+  auto start_pos_tensor = ET_UNWRAP(populate_start_pos_tensor(module_, start_pos, tokens->numel()));
 
   std::vector<runtime::EValue> inputs;
   auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor);

extension/llm/runner/util.h (32 additions, 0 deletions)

@@ -7,6 +7,9 @@
  */
 
 #pragma once
+#include <executorch/extension/llm/runner/constants.h>
+#include <executorch/extension/llm/runner/multimodal_prefiller.h>
+#include <executorch/extension/tensor/tensor.h>
 #include <executorch/runtime/platform/compiler.h>
 #include <stdio.h>
 #include <time.h>
@@ -99,6 +102,35 @@ ET_EXPERIMENTAL size_t inline get_rss_bytes() {
   // when this changed.
   return 0;
 }
+
+inline runtime::Result<TensorPtr> populate_start_pos_tensor(Module* module, int64_t& start_pos, int seq_len) {
+  // Get expected shape of cache position tensor, which should be the second argument
+  auto method_meta = ET_UNWRAP(module->method_meta(kTextModelMethod));
+  auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
+  auto second_input_sizes = second_input_info.sizes();
+  auto numel = second_input_sizes[0];
+
+  TensorPtr start_pos_tensor;
+  std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
+  if (numel > 1) {
+    // `cache_position` goes from start_pos to start_pos +
+    // encoder_output.size(1). e.g. if start_pos = 2 and encoder_output.size(1)
+    // = 5, cache_position_tensor should be [2, 3, 4, 5, 6].
+    std::vector<int64_t> cache_positions(seq_len);
+    for (int64_t i = 0; i < seq_len; ++i) {
+      cache_positions[i] = start_pos + i;
+    }
+    return ::executorch::extension::from_blob(
+        cache_positions.data(),
+        {static_cast<int>(seq_len)},
+        executorch::aten::ScalarType::Long);
+  } else {
+    // Cache position is size 1.
+    return ::executorch::extension::from_blob(
+        &start_pos, {1}, executorch::aten::ScalarType::Long);
+  }
+}
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
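
The new helper consolidates the two export conventions behind one probe of the method metadata: if the second input's leading dimension is greater than 1, the model was exported with a `cache_positions` vector; otherwise it takes a single scalar `input_pos`. (The locals `start_pos_tensor` and `sizes_vec` are left unused by the early returns.) One lifetime caveat in the `numel > 1` branch: `from_blob` borrows memory rather than owning it, and `cache_positions` is local to the helper, so the returned TensorPtr can outlive its storage. Below is a hedged sketch of an owning variant, assuming the `make_tensor_ptr` overload in extension/tensor that takes sizes plus a data vector by value; this is a suggested adaptation, not code from the commit.

// Hypothetical owning variant (not from the commit): move the positions
// vector into the tensor so the buffer survives after the function returns.
inline runtime::Result<TensorPtr> populate_start_pos_tensor_owning(
    Module* module, int64_t& start_pos, int seq_len) {
  // Probe the second input of the text model method, as the helper does.
  auto method_meta = ET_UNWRAP(module->method_meta(kTextModelMethod));
  auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
  auto numel = second_input_info.sizes()[0];

  if (numel > 1) {
    // cache_positions = [start_pos, ..., start_pos + seq_len - 1].
    std::vector<int64_t> cache_positions(seq_len);
    for (int64_t i = 0; i < seq_len; ++i) {
      cache_positions[i] = start_pos + i;
    }
    // make_tensor_ptr takes ownership of the vector (dtype deduced as Long),
    // so no dangling pointer is returned.
    return ::executorch::extension::make_tensor_ptr(
        {static_cast<executorch::aten::SizesType>(seq_len)},
        std::move(cache_positions));
  }
  // Scalar input_pos: borrowing the caller's start_pos is fine here, since
  // it must stay alive across the subsequent execute() call anyway.
  return ::executorch::extension::from_blob(
      &start_pos, {1}, executorch::aten::ScalarType::Long);
}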
