strange things

kirklandsign · kirklandsign · commit d020306f9a31 · 2025-09-23T20:57:10.000-07:00
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
@@ -167,7 +167,7 @@ public int generate(
   }
 
   /**
-   * Prefill an LLaVA Module with the given images input.
+   * Prefill a multimodal Module with the given image input.
    *
    * @param image Input image as a byte array
    * @param width Input image width
@@ -189,9 +189,9 @@ public long prefillImages(int[] image, int width, int height, int channels) {
   private native int appendImagesInput(int[] image, int width, int height, int channels);
 
   /**
-   * Prefill an LLaVA Module with the given text input.
+   * Prefill a multimodal Module with the given text input.
    *
-   * @param prompt The text prompt to LLaVA.
+   * @param prompt The text prompt to the multimodal model.
    * @return 0, as the updated starting position in KV cache of the input in the LLM is no longer
    *     exposed to user.
    * @throws RuntimeException if the prefill failed
@@ -208,6 +208,35 @@ public long prefillPrompt(String prompt) {
   // returns status
   private native int appendTextInput(String prompt);
 
+  /**
+   * Prefill a multimodal Module with audio input read from a file.
+   *
+   * @param filePath Path to the audio file; its bytes are read whole and sized as 4-byte floats
+   *     in batches of 128 bins x 3000 frames.
+   * @return the status returned by the native appendAudioInput call
+   * @throws RuntimeException if the file cannot be read completely
+   */
+  public int prefillAudio(String filePath) {
+    java.io.File file = new java.io.File(filePath);
+    try (java.io.FileInputStream fis = new java.io.FileInputStream(file)) {
+      byte[] fileBytes = new byte[(int) file.length()];
+      int bytesRead = fis.read(fileBytes);
+      if (bytesRead != fileBytes.length) {
+          throw new RuntimeException("Could not completely read file " + file.getName());
+      }
+      int nFloats = fileBytes.length / 4;
+      int batchSize = nFloats / (128 * 3000);
+      return appendAudioInput(fileBytes, batchSize, 128, 3000);
+    } catch (java.io.IOException e) {
+      throw new RuntimeException("Failed to read file: " + e);
+    }
+  }
+
+  // For Audio (option B), not RawAudio
+  // Use batch_size = ceil(n_floats / (n_bins * n_frames)), n_bins = 128, n_frames = 3000
+  // returns status
+  private native int appendAudioInput(byte[] audio, int batch_size, int n_bins, int n_frames);
+
   /**
    * Reset the context of the LLM. This will clear the KV cache and reset the state of the LLM.
    *
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
@@ -286,10 +286,10 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
   }
 
   jint prefill_audio_input(
-      facebook::jni::alias_ref<jintArray> audio,
+      facebook::jni::alias_ref<jbyteArray> audio,
       jint batch_size,
-      jint n_channels,
-      jint n_samples) {
+      jint n_bins,
+      jint n_frames) {
     if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) {
       return static_cast<jint>(Error::InvalidArgument);
     }
@@ -299,12 +299,12 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
     auto audio_size = audio->size();
     std::vector<uint8_t> audio_data(audio_size);
     if (audio_size != 0) {
-      std::vector<jint> audio_data_jint(audio_size);
-      audio->getRegion(0, audio_size, audio_data_jint.data());
+      std::vector<jbyte> audio_data_jbyte(audio_size);
+      audio->getRegion(0, audio_size, audio_data_jbyte.data());
       for (int i = 0; i < audio_size; i++) {
-        audio_data[i] = audio_data_jint[i];
+        audio_data[i] = audio_data_jbyte[i];
       }
-      llm::RawAudio audio_input{audio_data, batch_size, n_channels, n_samples};
+      llm::Audio audio_input{std::move(audio_data), batch_size, n_bins, n_frames};
       multi_modal_runner_->prefill(
           {llm::MultimodalInput{std::move(audio_input)}});
     }