[Executorch][llama] Change runner to decouple prompt length from sequence

kimishpatel · kimishpatel · commit 5b3859b947fa · 2025-03-25T12:26:45.000-07:00
length. Following the previous diff, we can now utilize the entire KV cache to generate more tokens than the maximum prompt length allows. Differential Revision: [D69073908](https://our.internmc.facebook.com/intern/diff/D69073908/) ghstack-source-id: 273982703 Pull Request resolved: #9594
diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp
@@ -31,6 +31,7 @@ static constexpr auto kEnableDynamicShape = "enable_dynamic_shape";
 static constexpr auto kBosId = "get_bos_id";
 static constexpr auto kEosIds = "get_eos_ids";
 static constexpr auto kMaxSeqLen = "get_max_seq_len";
+static constexpr auto kMaxContextLen = "get_max_context_len";
 static constexpr auto kVocabSize = "get_vocab_size";
 static constexpr auto kUseKVCache = "use_kv_cache";
 static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
@@ -49,6 +50,7 @@ Runner::Runner(
       metadata_({
           {kEnableDynamicShape, false},
           {kMaxSeqLen, 128},
+          {kMaxContextLen, 128},
           {kUseKVCache, true},
           {kUseSDPAWithKVCache, false},
       }) {
@@ -201,9 +203,9 @@ Error Runner::generate(
   shouldStop_ = false;
 
   // Set the sequence length to the max seq length if not provided
-  seq_len = (seq_len > 0 && seq_len <= metadata_.at(kMaxSeqLen))
+  seq_len = (seq_len > 0 && seq_len <= metadata_.at(kMaxContextLen))
       ? seq_len
-      : metadata_.at(kMaxSeqLen);
+      : metadata_.at(kMaxContextLen);
 
   ::tokenizers::Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
       prompt,