Commit d6c9e45

Update on "[ExecuTorch][Llama] Change runner to enable chunked prefill"

This diff adds code to chunk prompts longer than max_seq_len, enabling prefill of larger contexts.

Differential Revision: [D71833061](https://our.internmc.facebook.com/intern/diff/D71833061/)

[ghstack-poisoned]
1 parent 21f8a07 commit d6c9e45

File tree

1 file changed: +3 lines, −3 lines


examples/models/llama/runner/runner.cpp

Lines changed: 3 additions & 3 deletions
```diff
@@ -252,10 +252,10 @@ Error Runner::generate(
       std::vector<uint64_t> prompt_tokens_to_process(num_tokens_to_prefill_with);
       std::copy(
           prompt_tokens.begin() + num_tokens_to_process,
-          prompt_tokens.begin() + num_tokens_to_process + num_tokens_to_prefill_with,
+          prompt_tokens.begin() + num_tokens_to_process +
+              num_tokens_to_prefill_with,
           prompt_tokens_to_process.begin());
-      auto prefill_res =
-          text_prefiller_->prefill(prompt_tokens_to_process, pos);
+      auto prefill_res = text_prefiller_->prefill(prompt_tokens_to_process, pos);
       ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
       cur_token = prefill_res.get();
       num_tokens_to_process += num_tokens_to_prefill_with;
```
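For readers seeing the hunk out of context, here is a minimal, self-contained sketch of the chunked-prefill loop this change lives in. The `max_seq_len` per-chunk bound and the generic `prefill` callable (standing in for `text_prefiller_->prefill`) are assumptions for illustration; the runner's real control flow and signatures may differ.

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch only: splits a prompt longer than max_seq_len into chunks and
// prefills them one at a time. `prefill` is a stand-in for the runner's
// text_prefiller_->prefill and is assumed to consume a chunk starting at
// position `pos` and return the model's next-token prediction.
template <typename PrefillFn>
uint64_t chunked_prefill(
    const std::vector<uint64_t>& prompt_tokens,
    int64_t max_seq_len,
    PrefillFn prefill) {
  int64_t num_tokens_to_process = 0;
  int64_t pos = 0; // running position in the KV cache
  uint64_t cur_token = 0;
  const auto total = static_cast<int64_t>(prompt_tokens.size());
  while (num_tokens_to_process < total) {
    // Prefill at most max_seq_len tokens per iteration.
    const int64_t num_tokens_to_prefill_with =
        std::min(total - num_tokens_to_process, max_seq_len);
    std::vector<uint64_t> prompt_tokens_to_process(num_tokens_to_prefill_with);
    std::copy(
        prompt_tokens.begin() + num_tokens_to_process,
        prompt_tokens.begin() + num_tokens_to_process +
            num_tokens_to_prefill_with,
        prompt_tokens_to_process.begin());
    cur_token = prefill(prompt_tokens_to_process, pos);
    num_tokens_to_process += num_tokens_to_prefill_with;
    pos += num_tokens_to_prefill_with;
  }
  return cur_token; // seed token for the decode loop that follows
}
```

With, say, max_seq_len = 4 and a 10-token prompt, the loop issues chunks of 4, 4, and 2 tokens; only the final call's return value is kept as the first token of the decode phase.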
