Commit 75e6413

Android MultiModal JNI binding

Differential Revision: D61568605
Pull Request resolved: #4813

1 parent: 56001c3

9 files changed: 150 additions, 20 deletions

build/build_android_llm_demo.sh (6 additions, 0 deletions)

@@ -30,7 +30,10 @@ build_android_native_library() {
     -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DANDROID_PLATFORM=android-23 \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
+    -DEXECUTORCH_LOG_LEVEL=Info \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
+    -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
@@ -60,11 +63,14 @@ build_android_native_library() {
 
   cmake --build "${CMAKE_OUT}"/examples/models/llama2 -j "${CMAKE_JOBS}" --config Release
 
+
   cmake extension/android \
     -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DANDROID_PLATFORM=android-23 \
     -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
+    -DEXECUTORCH_ENABLE_LOGGING=ON \
+    -DEXECUTORCH_LOG_LEVEL=Info \
     -DEXECUTORCH_BUILD_LLAMA_JNI=ON \
     -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \
     -DCMAKE_BUILD_TYPE=Release \

examples/models/llava/runner/llava_image_prefiller.h (2 additions, 3 deletions)

@@ -24,9 +24,8 @@ class LlavaImagePrefiller : public ImagePrefiller {
    * @param start_pos The starting position in KV cache of the input in the LLM
    * @return logits of the image prefill.
    */
-  inline Result<exec_aten::Tensor> prefill(
-      Image& image,
-      int64_t start_pos = 0) {
+  inline Result<exec_aten::Tensor> prefill(Image& image, int64_t start_pos = 0)
+      override {
     ManagedTensor managed_images(
         image.data.data(), {3, image.height, image.width}, ScalarType::Byte);
     // Run image encoder
examples/models/llava/runner/targets.bzl (new file, 22 additions)

@@ -0,0 +1,22 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    runtime.cxx_library(
+        name = "runner",
+        srcs = ["llava_runner.cpp"],
+        exported_headers = ["llava_runner.h", "llava_image_prefiller.h", "llava_text_decoder_runner.h"],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        exported_deps = [
+            "//executorch/backends/xnnpack:xnnpack_backend",
+            "//executorch/extension/llm/runner:runner_lib",
+            "//executorch/extension/llm/tokenizer:bpe_tokenizer",
+            "//executorch/extension/evalue_util:print_evalue",
+            "//executorch/extension/runner_util:managed_tensor",
+            "//executorch/extension/module:module",
+            "//executorch/kernels/quantized:generated_lib",
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/util:tensor_util",
+        ],
+    )

extension/android/CMakeLists.txt (6 additions, 0 deletions)

@@ -79,6 +79,11 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
     TARGET llama_runner PROPERTY IMPORTED_LOCATION ${LLAMA_RUNNER_PATH}
   )
 
+  add_subdirectory(
+    ${EXECUTORCH_ROOT}/examples/models/llava/runner
+    ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llava/runner
+  )
+
   set(CUSTOM_OPS_PATH
       ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/custom_ops/libcustom_ops.a
   )
@@ -116,6 +121,7 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
     executorch_llama_jni
     ${link_libraries}
     llama_runner
+    llava_runner
     custom_ops
     cpublas
     eigen_blas

extension/android/jni/jni_layer_llama.cpp (64 additions, 12 deletions)

@@ -16,6 +16,8 @@
 #include <vector>
 
 #include <executorch/examples/models/llama2/runner/runner.h>
+#include <executorch/examples/models/llava/runner/llava_runner.h>
+#include <executorch/extension/llm/runner/image.h>
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/platform/platform.h>
 #include <executorch/runtime/platform/runtime.h>
@@ -90,21 +92,29 @@ class ExecuTorchLlamaJni
     : public facebook::jni::HybridClass<ExecuTorchLlamaJni> {
  private:
   friend HybridBase;
+  int model_type_category_;
   std::unique_ptr<Runner> runner_;
+  std::unique_ptr<MultimodalRunner> multi_modal_runner_;
 
  public:
   constexpr static auto kJavaDescriptor =
       "Lorg/pytorch/executorch/LlamaModule;";
 
+  constexpr static int MODEL_TYPE_CATEGORY_LLM = 1;
+  constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2;
+
   static facebook::jni::local_ref<jhybriddata> initHybrid(
       facebook::jni::alias_ref<jclass>,
+      jint model_type_category,
       facebook::jni::alias_ref<jstring> model_path,
       facebook::jni::alias_ref<jstring> tokenizer_path,
       jfloat temperature) {
-    return makeCxxInstance(model_path, tokenizer_path, temperature);
+    return makeCxxInstance(
+        model_type_category, model_path, tokenizer_path, temperature);
   }
 
   ExecuTorchLlamaJni(
+      jint model_type_category,
       facebook::jni::alias_ref<jstring> model_path,
       facebook::jni::alias_ref<jstring> tokenizer_path,
       jfloat temperature) {
@@ -119,30 +129,72 @@ class ExecuTorchLlamaJni
     }
 #endif
 
-    runner_ = std::make_unique<Runner>(
-        model_path->toStdString().c_str(),
-        tokenizer_path->toStdString().c_str(),
-        temperature);
+    model_type_category_ = model_type_category;
+    if (model_type_category == MODEL_TYPE_CATEGORY_MULTIMODAL) {
+      multi_modal_runner_ = std::make_unique<LlavaRunner>(
+          model_path->toStdString().c_str(),
+          tokenizer_path->toStdString().c_str(),
+          temperature);
+    } else if (model_type_category == MODEL_TYPE_CATEGORY_LLM) {
+      runner_ = std::make_unique<Runner>(
+          model_path->toStdString().c_str(),
+          tokenizer_path->toStdString().c_str(),
+          temperature);
+    }
   }
 
   jint generate(
+      facebook::jni::alias_ref<jintArray> image,
+      jint width,
+      jint height,
+      jint channels,
       facebook::jni::alias_ref<jstring> prompt,
       jint seq_len,
       facebook::jni::alias_ref<ExecuTorchLlamaCallbackJni> callback) {
-    runner_->generate(
-        prompt->toStdString(),
-        seq_len,
-        [callback](std::string result) { callback->onResult(result); },
-        [callback](const Stats& result) { callback->onStats(result); });
+    if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
+      auto image_size = image->size();
+      std::vector<Image> images;
+      if (image_size != 0) {
+        std::vector<jint> image_data_jint(image_size);
+        std::vector<uint8_t> image_data(image_size);
+        image->getRegion(0, image_size, image_data_jint.data());
+        for (int i = 0; i < image_size; i++) {
+          image_data[i] = image_data_jint[i];
+        }
+        Image image_runner{image_data, width, height, channels};
+        images.push_back(image_runner);
+      }
+      multi_modal_runner_->generate(
+          images,
+          prompt->toStdString(),
+          seq_len,
+          [callback](std::string result) { callback->onResult(result); },
+          [callback](const Stats& result) { callback->onStats(result); });
+    } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
+      runner_->generate(
+          prompt->toStdString(),
+          seq_len,
+          [callback](std::string result) { callback->onResult(result); },
+          [callback](const Stats& result) { callback->onStats(result); });
+    }
     return 0;
   }
 
   void stop() {
-    runner_->stop();
+    if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
+      multi_modal_runner_->stop();
+    } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
+      runner_->stop();
+    }
  }
 
   jint load() {
-    return static_cast<jint>(runner_->load());
+    if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
+      return static_cast<jint>(multi_modal_runner_->load());
+    } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
+      return static_cast<jint>(runner_->load());
+    }
+    return static_cast<jint>(Error::InvalidArgument);
   }
 
   static void registerNatives() {
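Note that the JNI layer widens each incoming Java int into a single uint8_t, so callers are expected to pass one array element per image byte, with values in the 0-255 range, rather than packed ARGB pixels. A minimal caller-side sketch follows; the ImageUtil helper is hypothetical (not part of this commit), and channel-first (CHW) ordering is an assumption based on the {3, image.height, image.width} tensor shape in llava_image_prefiller.h.

public final class ImageUtil {
  // Hypothetical helper: flatten 8-bit CHW pixel data into the int[] that the
  // new native generate() expects. One array element per byte, mirroring the
  // jint -> uint8_t copy in jni_layer_llama.cpp.
  public static int[] toImageIntArray(byte[] chwBytes) {
    int[] out = new int[chwBytes.length];
    for (int i = 0; i < chwBytes.length; i++) {
      out[i] = chwBytes[i] & 0xFF; // mask: Java bytes are signed, values must stay 0-255
    }
    return out;
  }
}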

extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java (34 additions, 3 deletions)

@@ -14,6 +14,10 @@
 import com.facebook.soloader.nativeloader.SystemDelegate;
 
 public class LlamaModule {
+
+  public static final int MODEL_TYPE_TEXT = 1;
+  public static final int MODEL_TYPE_TEXT_VISION = 2;
+
   static {
     if (!NativeLoader.isInitialized()) {
       NativeLoader.init(new SystemDelegate());
@@ -26,11 +30,16 @@ public class LlamaModule {
 
   @DoNotStrip
   private static native HybridData initHybrid(
-      String modulePath, String tokenizerPath, float temperature);
+      int modelType, String modulePath, String tokenizerPath, float temperature);
 
   /** Constructs a LLAMA Module for a model with given path, tokenizer, and temperature. */
   public LlamaModule(String modulePath, String tokenizerPath, float temperature) {
-    mHybridData = initHybrid(modulePath, tokenizerPath, temperature);
+    mHybridData = initHybrid(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature);
+  }
+
+  /** Constructs a LLM Module for a model with given path, tokenizer, and temperature. */
+  public LlamaModule(int modelType, String modulePath, String tokenizerPath, float temperature) {
+    mHybridData = initHybrid(modelType, modulePath, tokenizerPath, temperature);
   }
 
   public void resetNative() {
@@ -54,8 +63,30 @@ public int generate(String prompt, LlamaCallback llamaCallback) {
    * @param seqLen sequence length
    * @param llamaCallback callback object to receive results.
    */
+  public int generate(String prompt, int seqLen, LlamaCallback llamaCallback) {
+    return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback);
+  }
+
+  /**
+   * Start generating tokens from the module.
+   *
+   * @param image Input image as a byte array
+   * @param width Input image width
+   * @param height Input image height
+   * @param channels Input image number of channels
+   * @param prompt Input prompt
+   * @param seqLen sequence length
+   * @param llamaCallback callback object to receive results.
+   */
   @DoNotStrip
-  public native int generate(String prompt, int seqLen, LlamaCallback llamaCallback);
+  public native int generate(
+      int[] image,
+      int width,
+      int height,
+      int channels,
+      String prompt,
+      int seqLen,
+      LlamaCallback llamaCallback);
 
   /** Stop current generate() before it finishes. */
   @DoNotStrip
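With these changes the Java API selects the runner by model type constant. A minimal usage sketch follows; the model and tokenizer paths and the 336x336 RGB input size are placeholders (336x336 is a common LLaVA configuration), and LlamaCallback is assumed to expose onResult(String) and onStats(float) as in the existing bindings.

import org.pytorch.executorch.LlamaCallback;
import org.pytorch.executorch.LlamaModule;

public class LlavaDemo {
  public static void main(String[] args) {
    // Construct a multimodal module using the new model-type constructor.
    LlamaModule module = new LlamaModule(
        LlamaModule.MODEL_TYPE_TEXT_VISION,
        "/data/local/tmp/llava.pte",
        "/data/local/tmp/tokenizer.bin",
        0.8f);
    module.load(); // returns an ExecuTorch error code; 0 means Ok

    int width = 336, height = 336, channels = 3;
    int[] image = new int[channels * height * width]; // one int per image byte
    // ... fill image with pixel values in the 0-255 range ...

    module.generate(image, width, height, channels, "What is on the image?", 128,
        new LlamaCallback() {
          @Override
          public void onResult(String token) {
            System.out.print(token); // stream tokens as they are generated
          }

          @Override
          public void onStats(float tokensPerSecond) {
            // generation statistics reported at the end of a run
          }
        });
  }
}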

extension/llm/runner/image_prefiller.h (2 additions, 0 deletions)

@@ -32,6 +32,8 @@ class ImagePrefiller {
   virtual Error load() = 0;
   virtual bool is_method_loaded() = 0;
 
+  virtual ~ImagePrefiller() = default;
+
  protected:
   Module* module_;
 };

extension/llm/runner/multimodal_runner.h (2 additions, 0 deletions)

@@ -65,6 +65,8 @@ class MultimodalRunner {
     text_token_generator_->stop();
   }
 
+  virtual ~MultimodalRunner() = default;
+
  protected:
   // metadata
   int32_t vocab_size_;

extension/llm/runner/targets.bzl (12 additions, 2 deletions)

@@ -59,6 +59,17 @@ def define_common_targets():
         ],
     )
 
+    runtime.cxx_library(
+        name = "image_prefiller" + aten_suffix,
+        exported_headers = ["image_prefiller.h", "image.h"],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        exported_deps = [
+            "//executorch/extension/module:module" + aten_suffix,
+        ],
+    )
+
     runtime.cxx_library(
         name = "metadata_util" + aten_suffix,
         exported_headers = ["metadata_util.h"],
@@ -73,14 +84,13 @@ def define_common_targets():
     runtime.cxx_library(
         name = "runner_lib" + aten_suffix,
         exported_headers = [
-            "image_prefiller.h",
-            "image.h",
             "multimodal_runner.h",
         ],
         visibility = [
             "@EXECUTORCH_CLIENTS",
         ],
         exported_deps = [
+            ":image_prefiller" + aten_suffix,
             ":text_decoder_runner" + aten_suffix,
             ":text_prefiller" + aten_suffix,
             ":text_token_generator" + aten_suffix,
