Merged
4 changes: 2 additions & 2 deletions extension/llm/runner/multimodal_runner.cpp
@@ -65,8 +65,8 @@ Error MultimodalRunner::load() {
Error MultimodalRunner::generate(
const std::vector<MultimodalInput>& inputs,
const GenerationConfig& config,
-    std::function<void(const std::string&)>& token_callback,
-    std::function<void(const Stats&)>& stats_callback) {
+    std::function<void(const std::string&)> token_callback,
+    std::function<void(const Stats&)> stats_callback) {
if (inputs.empty()) {
ET_LOG(Error, "MultimodalInput vector cannot be empty");
return Error::InvalidArgument;
4 changes: 2 additions & 2 deletions extension/llm/runner/multimodal_runner.h
@@ -116,8 +116,8 @@ class ET_EXPERIMENTAL MultimodalRunner {
virtual ::executorch::runtime::Error generate(
const std::vector<MultimodalInput>& inputs,
const GenerationConfig& config,
-      std::function<void(const std::string&)>& token_callback,
-      std::function<void(const Stats&)>& stats_callback);
+      std::function<void(const std::string&)> token_callback = {},
+      std::function<void(const Stats&)> stats_callback = {});

inline void stop() {
text_token_generator_->stop();
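The header change is what makes the callbacks optional: taking the std::function parameters by value with {} defaults lets a caller omit them entirely or pass temporary lambdas, neither of which the old non-const reference parameters allowed. A minimal call-site sketch under that assumption follows; the wrapper function name, the default-constructed GenerationConfig, and the surrounding includes/using-declarations are illustrative, not part of this PR.

// Hypothetical call site. MultimodalRunner, MultimodalInput, GenerationConfig,
// Stats, and Error come from the ExecuTorch LLM runner headers shown in this
// diff; the appropriate includes and using-declarations are assumed.
executorch::runtime::Error run_generation(
    MultimodalRunner& runner,
    const std::vector<MultimodalInput>& inputs) {
  GenerationConfig config; // assumed to be default-constructible

  // With the defaulted by-value parameters, both callbacks can be omitted.
  auto err = runner.generate(inputs, config);
  if (err != executorch::runtime::Error::Ok) {
    return err;
  }

  // Temporary lambdas now bind directly to the by-value std::function
  // parameters; the old non-const reference parameters could not bind to
  // these rvalues.
  return runner.generate(
      inputs,
      config,
      [](const std::string& token) { /* stream the decoded token */ },
      [](const Stats& stats) { /* record timing/throughput stats */ });
}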
6 changes: 3 additions & 3 deletions extension/llm/runner/text_token_generator.h
@@ -36,9 +36,9 @@ class ET_EXPERIMENTAL TextTokenGenerator {

/**
* Token generation loop.
-   * @param tokens prompt tokens as well as the first token generated by
-   * prefill.
-   * @param start_pos the start position of the new tokens, based on how many
+   * @param tokens The first token generated by prefill, if using kv cache. Else
+   * the prompt tokens + the first token generated by prefill.
+   * @param start_pos The start position of the new tokens, based on how many
* prompt tokens is prefilled.
* @param max_new_tokens Maximum number of new tokens to generate.
* @param temperature controls the randomness of predictions by scaling the
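The reworded tokens contract can be read as: with a KV cache the prompt tokens are already cached by prefill, so only the first prefill-generated token is passed in; without one, the full prompt plus that first token must be supplied again. A hedged caller-side sketch of that contract follows; the function name, variable names, and the uint64_t token type are assumptions for illustration, not part of the ExecuTorch API.

#include <cstdint>
#include <vector>

// Illustrative construction of the `tokens` vector described in the updated
// doc comment above.
std::vector<uint64_t> build_generation_tokens(
    const std::vector<uint64_t>& prompt_tokens,
    uint64_t first_generated_token,
    bool use_kv_cache) {
  if (use_kv_cache) {
    // Prompt tokens already live in the KV cache after prefill, so only the
    // first token produced by prefill is passed to the generation loop.
    return {first_generated_token};
  }
  // Without a KV cache the generator must re-see the whole context:
  // the prompt tokens followed by the first prefill-generated token.
  std::vector<uint64_t> tokens = prompt_tokens;
  tokens.push_back(first_generated_token);
  return tokens;
}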