
Commit 9b2610e

Fix
1 parent 0237dfd commit 9b2610e

3 files changed: 12 additions & 3 deletions

extension/llm/runner/multimodal_prefiller.cpp

Lines changed: 3 additions & 1 deletion
@@ -100,8 +100,10 @@ Result<uint64_t> MultimodalPrefiller::prefill(
     ET_LOG(Error, "The encoder returned an empty output.");
     return ::executorch::runtime::Error::InvalidState;
   }
+  std::vector<int64_t> cache_positions;
+
   auto cache_position_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
-      kTextModelMethod, module_, start_pos, seq_len));
+      kTextModelMethod, module_, start_pos, cache_positions, seq_len));
 
   auto prefill_result = module_->execute(
       kTextModelMethod, {encoder_output, cache_position_tensor});
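
Both this call site and the one in text_decoder_runner.cpp below now declare a `cache_positions` vector in the caller and pass it by reference into `populate_start_pos_or_cache_position`, so the buffer that backs the returned cache-position tensor stays alive while `module_->execute` runs. Below is a minimal, self-contained sketch of that ownership pattern, assuming the tensor wraps the vector's storage without copying it; the `TensorView` type and `make_positions_tensor` helper are illustrative stand-ins, not the ExecuTorch API.

#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative non-owning tensor view over caller-provided storage.
struct TensorView {
  const int64_t* data;
  size_t numel;
};

// Stand-in for populate_start_pos_or_cache_position after this change:
// the caller supplies the backing vector, so the returned view remains
// valid for as long as the caller keeps that vector alive.
TensorView make_positions_tensor(
    int64_t start_pos, std::vector<int64_t>& backing, int seq_len) {
  backing.resize(seq_len);
  for (int64_t i = 0; i < seq_len; ++i) {
    backing[i] = start_pos + i;
  }
  return TensorView{backing.data(), backing.size()};
}

int main() {
  // Caller-owned buffer, declared before the call and kept alive while the
  // view is consumed (mirroring cache_positions at the prefill call site).
  std::vector<int64_t> cache_positions;
  TensorView cache_position_tensor =
      make_positions_tensor(/*start_pos=*/2, cache_positions, /*seq_len=*/5);
  for (size_t i = 0; i < cache_position_tensor.numel; ++i) {
    std::printf("%lld ", static_cast<long long>(cache_position_tensor.data[i]));
  }
  std::printf("\n"); // prints: 2 3 4 5 6
  return 0;
}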

extension/llm/runner/text_decoder_runner.cpp

Lines changed: 3 additions & 1 deletion
@@ -36,9 +36,11 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
   // If only 1 input, we are not using kv cache
   bool use_kv_cache = method_meta.num_inputs() > 1;
 
+  std::vector<int64_t> cache_positions;
+
   if (use_kv_cache) {
     auto start_pos_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
-        "forward", module_, start_pos, tokens->numel()));
+        "forward", module_, start_pos, cache_positions, tokens->numel()));
 
     std::vector<runtime::EValue> inputs;
     auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor);

extension/llm/runner/util.h

Lines changed: 6 additions & 1 deletion
@@ -111,6 +111,7 @@ inline runtime::Result<TensorPtr> populate_start_pos_or_cache_position(
     const char* method_name,
     Module* module,
     int64_t& start_pos,
+    std::vector<int64_t>& cache_positions_underlying_vector,
     int seq_len) {
   // Get expected shape of cache position tensor, which should be the second
   // argument
@@ -119,12 +120,16 @@
   auto second_input_sizes = second_input_info.sizes();
   auto numel = second_input_sizes[0];
 
+  for (int i = 0; i < second_input_sizes.size(); ++i) {
+    ET_LOG(Error, "second_input_sizes[%d] = %d", i, second_input_sizes[i]);
+  }
+
   TensorPtr start_pos_tensor;
   if (numel > 1) {
     // `cache_position` goes from start_pos to start_pos +
     // encoder_output.size(1). e.g. if start_pos = 2 and encoder_output.size(1)
     // = 5, cache_position_tensor should be [2, 3, 4, 5, 6].
-    std::vector<int64_t> cache_positions(seq_len);
+    cache_positions_underlying_vector.resize(seq_len);
    for (int64_t i = 0; i < seq_len; ++i) {
      cache_positions[i] = start_pos + i;
    }
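
As a quick sanity check of the fill logic described in the comment above, using the values from that example (start_pos = 2, seq_len = 5), the caller-supplied vector ends up holding [2, 3, 4, 5, 6]. A tiny standalone version of just that fill step, for illustration only:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const int64_t start_pos = 2; // example values from the comment above
  const int seq_len = 5;

  std::vector<int64_t> cache_positions_underlying_vector;
  cache_positions_underlying_vector.resize(seq_len);
  for (int64_t i = 0; i < seq_len; ++i) {
    cache_positions_underlying_vector[i] = start_pos + i;
  }

  // The cache-position tensor built over this buffer reads [2, 3, 4, 5, 6].
  assert((cache_positions_underlying_vector ==
          std::vector<int64_t>{2, 3, 4, 5, 6}));
  return 0;
}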
