Skip to content

Commit c7e2ea5

Browse files
schmidt-sebastian authored and copybara-github committed
Make generateResponseAsync() return a ListenableFuture and add ProgressCallback to its arguments
PiperOrigin-RevId: 732288089
1 parent a0452b4 commit c7e2ea5

File tree

5 files changed

+96
-80
lines changed

5 files changed

+96
-80
lines changed

mediapipe/tasks/cc/genai/inference/c/llm_inference_engine.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -180,7 +180,7 @@ typedef struct {
180180
// LlmResponseContext is the return type for
181181
// LlmInferenceEngine_Session_PredictSync.
182182
typedef struct {
183-
// An array of string. The size of the array depends on the number of
183+
// An array of strings. The size of the array depends on the number of
184184
// responses.
185185
char** response_array;
186186

mediapipe/tasks/java/com/google/mediapipe/tasks/genai/BUILD

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,8 @@ android_library(
5353
"//mediapipe/tasks/java/com/google/mediapipe/tasks/genai/llminference/jni/proto:llm_options_java_proto_lite",
5454
"//mediapipe/tasks/java/com/google/mediapipe/tasks/genai/llminference/jni/proto:llm_response_context_java_proto_lite",
5555
"//third_party:autovalue",
56+
"//third_party/java/android_libs/guava_jdk5:concurrent",
57+
"//third_party/java/android_libs/guava_jdk5:listenablefuture",
5658
"@com_google_protobuf//:protobuf_javalite",
5759
"@maven//:com_google_guava_guava",
5860
],

mediapipe/tasks/java/com/google/mediapipe/tasks/genai/llminference/LlmInference.java

Lines changed: 37 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
package com.google.mediapipe.tasks.genai.llminference;
22

3-
import static com.google.mediapipe.tasks.genai.llminference.LlmInferenceSession.decodeResponse;
43

54
import android.content.Context;
65
import com.google.auto.value.AutoValue;
6+
import com.google.common.util.concurrent.ListenableFuture;
77
import com.google.mediapipe.tasks.genai.llminference.jni.proto.LlmOptionsProto.LlmModelSettings;
88
import com.google.mediapipe.tasks.genai.llminference.jni.proto.LlmOptionsProto.LlmModelSettings.LlmPreferredBackend;
99
import java.util.Collections;
@@ -77,41 +77,12 @@ public static LlmInference createFromOptions(Context context, LlmInferenceOption
7777
}
7878
}
7979

80-
return new LlmInference(context, STATS_TAG, modelSettings.build(), options.resultListener());
80+
return new LlmInference(context, STATS_TAG, modelSettings.build());
8181
}
8282

8383
/** Constructor to initialize an {@link LlmInference}. */
84-
private LlmInference(
85-
Context context,
86-
String taskName,
87-
LlmModelSettings modelSettings,
88-
Optional<ProgressListener<String>> resultListener) {
89-
Optional<ProgressListener<List<String>>> llmResultListener;
90-
if (resultListener.isPresent()) {
91-
llmResultListener =
92-
Optional.of(
93-
new ProgressListener<List<String>>() {
94-
private boolean receivedFirstToken = false;
95-
96-
@Override
97-
public void run(List<String> partialResult, boolean done) {
98-
String result =
99-
decodeResponse(
100-
partialResult, /* stripLeadingWhitespace= */ !receivedFirstToken);
101-
if (done) {
102-
receivedFirstToken = false; // Reset to initial state
103-
resultListener.get().run(result, done);
104-
} else if (!result.isEmpty()) {
105-
receivedFirstToken = true;
106-
resultListener.get().run(result, done);
107-
}
108-
}
109-
});
110-
} else {
111-
llmResultListener = Optional.empty();
112-
}
113-
114-
this.taskRunner = new LlmTaskRunner(context, taskName, modelSettings, llmResultListener);
84+
private LlmInference(Context context, String taskName, LlmModelSettings modelSettings) {
85+
this.taskRunner = new LlmTaskRunner(context, taskName, modelSettings);
11586
this.implicitSession = new AtomicReference<>();
11687
}
11788

@@ -136,23 +107,50 @@ public String generateResponse(String inputText) {
136107
}
137108

138109
/**
139-
* Generates a response based on the input text. This method cannot be called while other queries
140-
* are active.
110+
* Asynchronously generates a response based on the input text. This method cannot be called while
111+
* other queries are active.
141112
*
142-
* <p>This function creates a new session for each call. If you want to have a stateful inference,
143-
* use {@link LlmInferenceSession#generateResponseAsync()} instead.
113+
* <p>This function creates a new session for each call and returns the complete response as a
114+
* {@link ListenableFuture}. If you want to have a stateful inference, use {@link
115+
* LlmInferenceSession#generateResponseAsync()} instead.
144116
*
145117
* <p>Note: You cannot invoke simultaneous response generation calls on active sessions created
146118
* using the same {@link LlmInference}. You have to wait for the currently running response
147119
* generation call to complete before initiating another one.
148120
*
149121
* @param inputText a {@link String} for processing.
122+
* @return a {@link ListenableFuture} with the complete response once the inference is complete.
150123
* @throws IllegalStateException if the inference fails.
151124
*/
152-
public void generateResponseAsync(String inputText) {
125+
public ListenableFuture<String> generateResponseAsync(String inputText) {
153126
LlmInferenceSession session = resetImplicitSession();
154127
session.addQueryChunk(inputText);
155-
session.generateResponseAsync();
128+
return session.generateResponseAsync();
129+
}
130+
131+
/**
132+
* Asynchronously generates a response based on the input text and emits partial results. This
133+
* method cannot be called while other queries are active.
134+
*
135+
* <p>This function creates a new session for each call and returns the complete response as a
136+
* {@link ListenableFuture} and invokes the {@code progressListener} as the response is generated.
137+
* If you want to have a stateful inference, use {@link
138+
* LlmInferenceSession#generateResponseAsync()} instead.
139+
*
140+
* <p>Note: You cannot invoke simultaneous response generation calls on active sessions created
141+
* using the same {@link LlmInference}. You have to wait for the currently running response
142+
* generation call to complete before initiating another one.
143+
*
144+
* @param inputText a {@link String} for processing.
145+
* @param progressListener a {@link ProgressListener} to receive partial results.
146+
* @return a {@link ListenableFuture} with the complete response once the inference is complete.
147+
* @throws IllegalStateException if the inference fails.
148+
*/
149+
public ListenableFuture<String> generateResponseAsync(
150+
String inputText, ProgressListener<String> progressListener) {
151+
LlmInferenceSession session = resetImplicitSession();
152+
session.addQueryChunk(inputText);
153+
return session.generateResponseAsync(progressListener);
156154
}
157155

158156
/**
@@ -211,12 +209,6 @@ public abstract static class Builder {
211209
/** Sets the model path for the text generator task. */
212210
public abstract Builder setModelPath(String modelPath);
213211

214-
/** Sets the result listener to invoke with the async API. */
215-
public abstract Builder setResultListener(ProgressListener<String> listener);
216-
217-
/** Sets the error listener to invoke with the async API. */
218-
public abstract Builder setErrorListener(ErrorListener listener);
219-
220212
/** Configures the total number of tokens for input and output). */
221213
public abstract Builder setMaxTokens(int maxTokens);
222214

@@ -263,12 +255,6 @@ public final LlmInferenceOptions build() {
263255
/** The supported lora ranks for the base model. Used by GPU only. */
264256
public abstract List<Integer> supportedLoraRanks();
265257

266-
/** The result listener to use for the {@link LlmInference#generateAsync} API. */
267-
public abstract Optional<ProgressListener<String>> resultListener();
268-
269-
/** The error listener to use for the {@link LlmInference#generateAsync} API. */
270-
public abstract Optional<ErrorListener> errorListener();
271-
272258
/** The model options to for vision modality. */
273259
public abstract Optional<VisionModelOptions> visionModelOptions();
274260

mediapipe/tasks/java/com/google/mediapipe/tasks/genai/llminference/LlmInferenceSession.java

Lines changed: 46 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
package com.google.mediapipe.tasks.genai.llminference;
22

33
import com.google.auto.value.AutoValue;
4+
import com.google.common.util.concurrent.ListenableFuture;
5+
import com.google.common.util.concurrent.SettableFuture;
46
import com.google.mediapipe.framework.image.MPImage;
57
import com.google.mediapipe.tasks.genai.llminference.LlmTaskRunner.LlmSession;
68
import com.google.mediapipe.tasks.genai.llminference.jni.proto.LlmOptionsProto.LlmSessionConfig;
@@ -97,20 +99,57 @@ public String generateResponse() {
9799
}
98100

99101
/**
100-
* Generates a response based on the previously added query chunks asynchronously.
102+
* Asynchronously generates a response based on the input text. This method cannot be called while
103+
* other queries are active.
101104
*
102-
* <p>The {@code resultListener} callback of the {@link LlmInference} instance returns the partial
103-
* responses from the LLM. Use {@link #addQueryChunk(String)} to add at least one query chunk
104-
* before calling this function.
105+
* <p>The method returns the complete response as a {@link ListenableFuture}. Use {@link
106+
* #addQueryChunk(String)} to add at least one query chunk before calling this function.
107+
*
108+
* <p>Note: You cannot invoke simultaneous response generation calls on active sessions created
109+
* using the same {@link LlmInference}. You have to wait for the currently running response
110+
* generation call to complete before initiating another one.
111+
*
112+
* @return a {@link ListenableFuture} with the complete response once the inference is complete.
113+
* @throws IllegalStateException if the inference fails.
114+
*/
115+
public ListenableFuture<String> generateResponseAsync() {
116+
return generateResponseAsync((unused1, unused2) -> {});
117+
}
118+
119+
/**
120+
* Asynchronously generates a response based on the input text and emits partial results. This
121+
* method cannot be called while other queries are active.
122+
*
123+
* <p>The method returns the complete response as a {@link ListenableFuture} and invokes the
124+
* {@code progressListener} as the response is generated. Use {@link #addQueryChunk(String)} to
125+
* add at least one query chunk before calling this function.
105126
*
106127
* <p>Note: You cannot invoke simultaneous response generation calls on active sessions created
107128
* using the same {@link LlmInference}. You have to wait for the currently running response
108129
* generation call to complete before initiating another one.
109130
*
131+
* @param progressListener a {@link ProgressListener} to receive partial results.
132+
* @return a {@link ListenableFuture} with the complete response once the inference is complete.
110133
* @throws IllegalStateException if the inference fails.
111134
*/
112-
public void generateResponseAsync() {
113-
taskRunner.predictAsync(session);
135+
public ListenableFuture<String> generateResponseAsync(ProgressListener<String> progressListener) {
136+
SettableFuture<String> future = SettableFuture.create();
137+
StringBuilder response = new StringBuilder();
138+
taskRunner.predictAsync(
139+
session,
140+
(partialResult, done) -> {
141+
// Not using isEmpty() because it's not available on Android < 30.
142+
boolean stripLeadingWhitespace = response.length() == 0;
143+
String partialResultDecoded = decodeResponse(partialResult, stripLeadingWhitespace);
144+
response.append(partialResultDecoded);
145+
if (done) {
146+
progressListener.run(partialResultDecoded, done);
147+
future.set(response.toString());
148+
} else if (!partialResultDecoded.isEmpty()) {
149+
progressListener.run(partialResultDecoded, done);
150+
}
151+
});
152+
return future;
114153
}
115154

116155
/**
@@ -126,7 +165,7 @@ public int sizeInTokens(String text) {
126165
}
127166

128167
/** Decodes the response from the LLM engine and returns a human-readable string. */
129-
static String decodeResponse(List<String> responses, boolean stripLeadingWhitespace) {
168+
private static String decodeResponse(List<String> responses, boolean stripLeadingWhitespace) {
130169
if (responses.isEmpty()) {
131170
// Technically, this is an error. We should always get at least one response.
132171
return "";

mediapipe/tasks/java/com/google/mediapipe/tasks/genai/llminference/LlmTaskRunner.java

Lines changed: 10 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,6 @@
2929
import com.google.protobuf.InvalidProtocolBufferException;
3030
import java.nio.ByteBuffer;
3131
import java.util.List;
32-
import java.util.Optional;
3332
import java.util.concurrent.atomic.AtomicBoolean;
3433

3534
/**
@@ -39,10 +38,11 @@
3938
*/
4039
public final class LlmTaskRunner implements AutoCloseable {
4140
private final long engineHandle;
42-
private final Optional<ProgressListener<List<String>>> resultListener;
4341
private final long callbackHandle;
4442
private final AtomicBoolean isProcessing;
4543

44+
private ProgressListener<List<String>> resultListener = (unused1, unused2) -> {};
45+
4646
/**
4747
* Describes how pixel bits encode color. A pixel may be an alpha mask, a grayscale, RGB, or ARGB.
4848
*
@@ -155,20 +155,9 @@ public static final class LlmSession {
155155
}
156156
}
157157

158-
public LlmTaskRunner(
159-
Context context,
160-
String taskName,
161-
LlmModelSettings modelSettings,
162-
Optional<ProgressListener<List<String>>> resultListener) {
158+
public LlmTaskRunner(Context context, String taskName, LlmModelSettings modelSettings) {
163159
this.engineHandle = nativeCreateEngine(modelSettings.toByteArray());
164-
165-
this.resultListener = resultListener;
166-
if (resultListener.isPresent()) {
167-
this.callbackHandle = nativeRegisterCallback(this);
168-
} else {
169-
this.callbackHandle = 0;
170-
}
171-
160+
this.callbackHandle = nativeRegisterCallback(this);
172161
this.isProcessing = new AtomicBoolean(false);
173162
}
174163

@@ -213,20 +202,18 @@ public List<String> predictSync(LlmSession session) {
213202
}
214203

215204
/** Invokes the LLM with the given session and calls the callback with the result. */
216-
public void predictAsync(LlmSession session) {
205+
public void predictAsync(LlmSession session, ProgressListener<List<String>> resultListener) {
217206
validateState();
218207

219-
if (callbackHandle == 0) {
220-
throw new IllegalStateException("No result listener provided.");
221-
}
222-
223208
try {
224209
isProcessing.set(true);
210+
this.resultListener = resultListener;
225211
nativePredictAsync(session.sessionHandle, callbackHandle);
226212
} catch (Throwable t) {
227213
// Only reset `isProcessing` if we fail to start the async inference. For successful
228214
// inferences, we reset `isProcessing` when we receive `done=true`.
229215
isProcessing.set(false);
216+
this.resultListener = (unused1, unused2) -> {};
230217
throw t;
231218
}
232219
}
@@ -265,10 +252,12 @@ private LlmResponseContext parseResponse(byte[] response) {
265252

266253
private void onAsyncResponse(byte[] responseBytes) {
267254
LlmResponseContext response = parseResponse(responseBytes);
255+
ProgressListener<List<String>> resultListener = this.resultListener;
268256
if (response.getDone()) {
269257
isProcessing.set(false);
258+
this.resultListener = (unused1, unused2) -> {};
270259
}
271-
resultListener.get().run(response.getResponsesList(), response.getDone());
260+
resultListener.run(response.getResponsesList(), response.getDone());
272261
}
273262

274263
@Override

0 commit comments

Comments (0)