Update on "[llm] Support different shape of input_pos"
For Hugging Face models, `forward()` takes `tokens` as well as `cache_positions`, a 1-D tensor of cache indices. This differs from the .pte files that `export_llama` produces, which take `tokens` and `input_pos`, where `input_pos` is a scalar tensor.
This PR adds support in `text_decoder_runner.cpp` for handling both shapes of `input_pos`/`cache_positions`.
To keep the logic generic without relying on extra metadata, the runner inspects the method meta and the expected input tensor info to decide whether to feed `input_pos` or `cache_positions`.
Differential Revision: [D77203700](https://our.internmc.facebook.com/intern/diff/D77203700/)
[ghstack-poisoned]