Skip to content

Commit e289f6c

Browse files
Make token and stat callback optional in multimodal runner (#13950)
This PR was created by the merge bot to help merge the original PR into the main branch. ghstack PR number: #13872 by @jackzhxng — please use that PR as the source of truth for the PR details, comments, and reviews. ghstack PR base: https://github.com/pytorch/executorch/tree/gh/jackzhxng/36/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/jackzhxng/36/head Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/jackzhxng/35/orig Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/jackzhxng/36/orig @diff-train-skip-merge --------- Co-authored-by: Jack Zhang <[email protected]>
1 parent c1d973e commit e289f6c

File tree

3 files changed

+7
-7
lines changed

3 files changed

+7
-7
lines changed

extension/llm/runner/multimodal_runner.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,8 +65,8 @@ Error MultimodalRunner::load() {
6565
Error MultimodalRunner::generate(
6666
const std::vector<MultimodalInput>& inputs,
6767
const GenerationConfig& config,
68-
std::function<void(const std::string&)>& token_callback,
69-
std::function<void(const Stats&)>& stats_callback) {
68+
std::function<void(const std::string&)> token_callback,
69+
std::function<void(const Stats&)> stats_callback) {
7070
if (inputs.empty()) {
7171
ET_LOG(Error, "MultimodalInput vector cannot be empty");
7272
return Error::InvalidArgument;

extension/llm/runner/multimodal_runner.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -116,8 +116,8 @@ class ET_EXPERIMENTAL MultimodalRunner {
116116
virtual ::executorch::runtime::Error generate(
117117
const std::vector<MultimodalInput>& inputs,
118118
const GenerationConfig& config,
119-
std::function<void(const std::string&)>& token_callback,
120-
std::function<void(const Stats&)>& stats_callback);
119+
std::function<void(const std::string&)> token_callback = {},
120+
std::function<void(const Stats&)> stats_callback = {});
121121

122122
inline void stop() {
123123
text_token_generator_->stop();

extension/llm/runner/text_token_generator.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,9 +36,9 @@ class ET_EXPERIMENTAL TextTokenGenerator {
3636

3737
/**
3838
* Token generation loop.
39-
* @param tokens prompt tokens as well as the first token generated by
40-
* prefill.
41-
* @param start_pos the start position of the new tokens, based on how many
39+
* @param tokens The first token generated by prefill, if using kv cache. Else
40+
* the prompt tokens + the first token generated by prefill.
41+
* @param start_pos The start position of the new tokens, based on how many
4242
* prompt tokens is prefilled.
4343
* @param max_new_tokens Maximum number of new tokens to generate.
4444
* @param temperature controls the randomness of predictions by scaling the

0 commit comments

Comments (0)