Commit f081a81

Merge branch 'main' into export-D83863515
2 parents 80a98fb + 8fa1e38

10 files changed: +1916, -55 lines

.ci/scripts/test-cuda-build.sh

Lines changed: 0 additions & 3 deletions
@@ -27,9 +27,6 @@ test_executorch_cuda_build() {
   nvcc --version || echo "nvcc not found"
   nvidia-smi || echo "nvidia-smi not found"

-  # Set CMAKE_ARGS to enable CUDA build - ExecuTorch will handle PyTorch installation automatically
-  export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
-
   echo "=== Starting ExecuTorch Installation ==="
   # Install ExecuTorch with CUDA support with timeout and error handling
   timeout 5400 ./install_executorch.sh || {

.ci/scripts/test_llama_lora.sh

Lines changed: 45 additions & 3 deletions
@@ -94,7 +94,7 @@ else
   exit 1
 fi

-# Export LoRA PTE, PTD file.
+# Export LoRA PTE, foundation PTD file.
 MODEL_SEPARATE="${MODEL_NAME}_separate"
 $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
   base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
@@ -114,20 +114,62 @@ $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
 NOW=$(date +"%H:%M:%S")
 echo "Starting to run llama runner at ${NOW}"
 # shellcheck source=/dev/null
-cmake-out/examples/models/llama/llama_main --model_path=${MODEL_SEPARATE}.pte --data_path=${MODEL_SEPARATE}.ptd --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_SEPARATE}.pte --data_paths=${MODEL_SEPARATE}.ptd --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
 NOW=$(date +"%H:%M:%S")
 echo "Finished at ${NOW}"

 RESULT2=$(cat result2.txt)
 if [[ "${RESULT2}" == "${EXPECTED_PREFIX}"* ]]; then
   echo "Expected result prefix: ${EXPECTED_PREFIX}"
   echo "Actual result: ${RESULT2}"
+  # Do not clean up files if test passes, as they're re-used in the next test.
   echo "Success"
-  cleanup_files
 else
   echo "Expected result prefix: ${EXPECTED_PREFIX}"
   echo "Actual result: ${RESULT2}"
   echo "Failure; results not the same"
   cleanup_files
   exit 1
 fi
+
+# Export LoRA PTE, LoRA PTD, foundation PTD file.
+MODEL_PROGRAM_ONLY="${MODEL_NAME}_program"
+MODEL_LORA_WEIGHTS="lora_weights"
+MODEL_FOUNDATION_WEIGHTS="foundation_weights"
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  base.adapter_checkpoint="${DOWNLOADED_PATH}/adapter_model.pt" \
+  base.adapter_config="${DOWNLOADED_PATH}/adapter_config.json" \
+  base.tokenizer_path="${DOWNLOADED_PATH}/tokenizer.model" \
+  model.use_kv_cache=true \
+  model.use_sdpa_with_kv_cache=true \
+  model.dtype_override="fp32" \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  export.output_name="${MODEL_PROGRAM_ONLY}.pte" \
+  export.foundation_weights_file="${MODEL_FOUNDATION_WEIGHTS}.ptd" \
+  export.lora_weights_file="${MODEL_LORA_WEIGHTS}.ptd"
+
+# Run llama runner.
+NOW=$(date +"%H:%M:%S")
+echo "Starting to run llama runner at ${NOW}"
+# shellcheck source=/dev/null
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_PROGRAM_ONLY}.pte --data_paths="${MODEL_FOUNDATION_WEIGHTS}.ptd,${MODEL_LORA_WEIGHTS}.ptd" --prompt="${PROMPT}" ${RUNTIME_ARGS} > result3.txt
+NOW=$(date +"%H:%M:%S")
+echo "Finished at ${NOW}"
+
+RESULT3=$(cat result3.txt)
+if [[ "${RESULT3}" == "${EXPECTED_PREFIX}"* ]]; then
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT3}"
+  echo "Success"
+else
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT3}"
+  echo "Failure; results not the same"
+  cleanup_files
+  exit 1
+fi
+
+cleanup_files

.github/workflows/cuda.yml

Lines changed: 4 additions & 4 deletions
@@ -1,7 +1,7 @@
 # Test ExecuTorch CUDA Build Compatibility
 # This workflow tests whether ExecuTorch can be successfully built with CUDA support
 # across different CUDA versions (12.6, 12.8, 12.9) using the command:
-# CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+# ./install_executorch.sh
 #
 # Note: ExecuTorch automatically detects the system CUDA version using nvcc and
 # installs the appropriate PyTorch wheel. No manual CUDA/PyTorch installation needed.
@@ -43,7 +43,7 @@ jobs:
         set -eux

         # Test ExecuTorch CUDA build - ExecuTorch will automatically detect CUDA version
-        # and install the appropriate PyTorch wheel when CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
+        # and install the appropriate PyTorch wheel
         source .ci/scripts/test-cuda-build.sh "${{ matrix.cuda-version }}"

   # This job will fail if any of the CUDA versions fail
@@ -83,7 +83,7 @@ jobs:
       script: |
         set -eux

-        PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+        PYTHON_EXECUTABLE=python ./install_executorch.sh
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda

@@ -110,7 +110,7 @@ jobs:
         set -eux

         echo "::group::Setup ExecuTorch"
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+        ./install_executorch.sh
         echo "::endgroup::"

         echo "::group::Setup Huggingface"

README.md

Lines changed: 7 additions & 3 deletions
@@ -104,14 +104,16 @@ outputs = method.execute([torch.randn(1, 3, 224, 224)])

 Module module("model.pte");
 auto tensor = make_tensor_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
-auto outputs = module.forward({tensor});
+auto outputs = module.forward(tensor);
 ```

 **[Swift (iOS)](https://docs.pytorch.org/executorch/main/ios-section.html)**
 ```swift
+import ExecuTorch
+
 let module = Module(filePath: "model.pte")
-let input = Tensor<Float>([1.0, 2.0, 3.0, 4.0])
-let outputs: [Value] = try module.forward([input])
+let input = Tensor<Float>([1.0, 2.0, 3.0, 4.0], shape: [2, 2])
+let outputs = try module.forward(input)
 ```

 **[Kotlin (Android)](https://docs.pytorch.org/executorch/main/android-section.html)**
@@ -151,6 +153,8 @@ runner->generate("Hello, how are you?", config);

 **[Swift (iOS)](https://docs.pytorch.org/executorch/main/llm/run-on-ios.html)**
 ```swift
+import ExecuTorchLLM
+
 let runner = TextRunner(modelPath: "llama.pte", tokenizerPath: "tiktoken.bin")
 try runner.generate("Hello, how are you?", Config {
   $0.sequenceLength = 128
examples/models/llama/export_llama_lib.py

Lines changed: 9 additions & 14 deletions
@@ -1136,20 +1136,15 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
         llm_config.backend.xnnpack.enabled = True

     if llm_config.backend.xnnpack.enabled:
-        if llm_config.export.foundation_weights_file is not None:
-            if llm_config.export.lora_weights_file is not None:
-                gen_tag_fn: Callable[[torch.fx.Node], Optional[str]] = lambda x: (
-                    llm_config.export.foundation_weights_file
-                    if "lora" not in x.name
-                    else None
-                )
-            else:
-                gen_tag_fn: Callable[[torch.fx.Node], Optional[str]] = lambda x: (
-                    llm_config.export.foundation_weights_file
-                    if "lora" not in x.name
-                    else llm_config.export.lora_weights_file
-                )
-
+        if (
+            llm_config.export.foundation_weights_file is not None
+            or llm_config.export.lora_weights_file is not None
+        ):
+            gen_tag_fn: Callable[[torch.fx.Node], Optional[str]] = lambda x: (
+                llm_config.export.foundation_weights_file
+                if "lora" not in x.name
+                else llm_config.export.lora_weights_file
+            )
         from executorch.exir.passes.external_constants_pass import (
             delegate_external_constants_pass_unlifted,
             external_constants_pass,

examples/models/llama/main.cpp

Lines changed: 28 additions & 6 deletions
@@ -8,6 +8,8 @@
  */

 #include <gflags/gflags.h>
+#include <sstream>
+#include <vector>

 #include <executorch/examples/models/llama/runner/runner.h>

@@ -21,7 +23,10 @@ DEFINE_string(
     "llama2.pte",
     "Model serialized in flatbuffer format.");

-DEFINE_string(data_path, "", "Data file for the model.");
+DEFINE_string(
+    data_paths,
+    "",
+    "Data files for the model. If multiple files are provided, they should be comma separated.");

 DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");

@@ -54,6 +59,26 @@ DEFINE_int32(

 DEFINE_bool(warmup, false, "Whether to run a warmup run.");

+// Helper function to parse comma-separated string lists
+std::vector<std::string> parseStringList(const std::string& input) {
+  std::vector<std::string> result;
+  if (input.empty()) {
+    return result;
+  }
+
+  std::stringstream ss(input);
+  std::string item;
+  while (std::getline(ss, item, ',')) {
+    // Trim whitespace
+    item.erase(0, item.find_first_not_of(" \t"));
+    item.erase(item.find_last_not_of(" \t") + 1);
+    if (!item.empty()) {
+      result.push_back(item);
+    }
+  }
+  return result;
+}
+
 int32_t main(int32_t argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);

@@ -62,10 +87,7 @@ int32_t main(int32_t argc, char** argv) {
   // and users can create their own DataLoaders to load from arbitrary sources.
   const char* model_path = FLAGS_model_path.c_str();

-  std::optional<std::string> data_path = std::nullopt;
-  if (!FLAGS_data_path.empty()) {
-    data_path = FLAGS_data_path.c_str();
-  }
+  std::vector<std::string> data_paths = parseStringList(FLAGS_data_paths);

   const char* tokenizer_path = FLAGS_tokenizer_path.c_str();

@@ -92,7 +114,7 @@ int32_t main(int32_t argc, char** argv) {
 #endif
   // create llama runner
   std::unique_ptr<::executorch::extension::llm::TextLLMRunner> runner =
-      example::create_llama_runner(model_path, tokenizer_path, data_path);
+      example::create_llama_runner(model_path, tokenizer_path, data_paths);

   if (runner == nullptr) {
     ET_LOG(Error, "Failed to create llama runner");
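
For reference, here is a minimal standalone sketch of the splitting and trimming behavior the new --data_paths flag relies on. The parseStringList body is copied from the hunk above; the main() harness and the sample flag value are illustrative additions, not part of the commit.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Same comma-splitting-and-trimming logic as the helper added in main.cpp.
std::vector<std::string> parseStringList(const std::string& input) {
  std::vector<std::string> result;
  if (input.empty()) {
    return result;
  }

  std::stringstream ss(input);
  std::string item;
  while (std::getline(ss, item, ',')) {
    // Trim leading/trailing whitespace around each entry.
    item.erase(0, item.find_first_not_of(" \t"));
    item.erase(item.find_last_not_of(" \t") + 1);
    if (!item.empty()) {
      result.push_back(item);
    }
  }
  return result;
}

int main() {
  // Illustrative value; the LoRA CI test above passes a similar
  // comma-separated pair of .ptd files to --data_paths.
  const std::string flag_value = "foundation_weights.ptd, lora_weights.ptd";
  for (const auto& path : parseStringList(flag_value)) {
    std::cout << path << "\n";  // prints "foundation_weights.ptd" then "lora_weights.ptd"
  }
  return 0;
}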

extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java

Lines changed: 23 additions & 0 deletions
@@ -207,6 +207,29 @@ public long prefillImages(int[] image, int width, int height, int channels) {

   private native int appendImagesInput(int[] image, int width, int height, int channels);

+  /**
+   * Prefill an LLaVA Module with the given images input.
+   *
+   * @param image Input normalized image as a float array
+   * @param width Input image width
+   * @param height Input image height
+   * @param channels Input image number of channels
+   * @return 0, as the updated starting position in KV cache of the input in the LLM is no longer
+   *     exposed to user.
+   * @throws RuntimeException if the prefill failed
+   */
+  @Deprecated
+  public long prefillImages(float[] image, int width, int height, int channels) {
+    int nativeResult = appendNormalizedImagesInput(image, width, height, channels);
+    if (nativeResult != 0) {
+      throw new RuntimeException("Prefill failed with error code: " + nativeResult);
+    }
+    return 0;
+  }
+
+  private native int appendNormalizedImagesInput(
+      float[] image, int width, int height, int channels);
+
   /**
    * Prefill an LLaVA Module with the given text input.
    *

extension/android/jni/jni_layer_llama.cpp

Lines changed: 29 additions & 0 deletions
@@ -289,6 +289,32 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
     return 0;
   }

+  // Returns status_code
+  jint append_normalized_images_input(
+      facebook::jni::alias_ref<jfloatArray> image,
+      jint width,
+      jint height,
+      jint channels) {
+    std::vector<llm::Image> images;
+    if (image == nullptr) {
+      return static_cast<jint>(Error::EndOfMethod);
+    }
+    auto image_size = image->size();
+    if (image_size != 0) {
+      std::vector<jfloat> image_data_jfloat(image_size);
+      std::vector<float> image_data(image_size);
+      image->getRegion(0, image_size, image_data_jfloat.data());
+      for (int i = 0; i < image_size; i++) {
+        image_data[i] = image_data_jfloat[i];
+      }
+      llm::Image image_runner{std::move(image_data), width, height, channels};
+      prefill_inputs_.emplace_back(
+          llm::MultimodalInput{std::move(image_runner)});
+    }
+
+    return 0;
+  }
+
   void stop() {
     if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
       multi_modal_runner_->stop();
@@ -323,6 +349,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
         makeNativeMethod("load", ExecuTorchLlmJni::load),
         makeNativeMethod(
             "appendImagesInput", ExecuTorchLlmJni::append_images_input),
+        makeNativeMethod(
+            "appendNormalizedImagesInput",
+            ExecuTorchLlmJni::append_normalized_images_input),
         makeNativeMethod(
             "appendTextInput", ExecuTorchLlmJni::append_text_input),
         makeNativeMethod("resetContext", ExecuTorchLlmJni::reset_context),
