
Commit cf8c7b0

Merge branch 'main' into cuda-err-msg
2 parents bede3a9 + c9339e2 commit cf8c7b0


76 files changed: +6635 −1353 lines changed


.ci/scripts/test-cuda-build.sh

Lines changed: 0 additions & 3 deletions
@@ -27,9 +27,6 @@ test_executorch_cuda_build() {
   nvcc --version || echo "nvcc not found"
   nvidia-smi || echo "nvidia-smi not found"
 
-  # Set CMAKE_ARGS to enable CUDA build - ExecuTorch will handle PyTorch installation automatically
-  export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
-
   echo "=== Starting ExecuTorch Installation ==="
   # Install ExecuTorch with CUDA support with timeout and error handling
   timeout 5400 ./install_executorch.sh || {

.ci/scripts/test_llama_lora.sh

Lines changed: 45 additions & 3 deletions
@@ -94,7 +94,7 @@ else
   exit 1
 fi
 
-# Export LoRA PTE, PTD file.
+# Export LoRA PTE, foundation PTD file.
 MODEL_SEPARATE="${MODEL_NAME}_separate"
 $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
   base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
@@ -114,20 +114,62 @@ $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
 NOW=$(date +"%H:%M:%S")
 echo "Starting to run llama runner at ${NOW}"
 # shellcheck source=/dev/null
-cmake-out/examples/models/llama/llama_main --model_path=${MODEL_SEPARATE}.pte --data_path=${MODEL_SEPARATE}.ptd --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_SEPARATE}.pte --data_paths=${MODEL_SEPARATE}.ptd --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
 NOW=$(date +"%H:%M:%S")
 echo "Finished at ${NOW}"
 
 RESULT2=$(cat result2.txt)
 if [[ "${RESULT2}" == "${EXPECTED_PREFIX}"* ]]; then
   echo "Expected result prefix: ${EXPECTED_PREFIX}"
   echo "Actual result: ${RESULT2}"
+  # Do not clean up files if test passes, as they're re-used in the next test.
   echo "Success"
-  cleanup_files
 else
   echo "Expected result prefix: ${EXPECTED_PREFIX}"
   echo "Actual result: ${RESULT2}"
   echo "Failure; results not the same"
   cleanup_files
   exit 1
 fi
+
+# Export LoRA PTE, LoRA PTD, foundation PTD file.
+MODEL_PROGRAM_ONLY="${MODEL_NAME}_program"
+MODEL_LORA_WEIGHTS="lora_weights"
+MODEL_FOUNDATION_WEIGHTS="foundation_weights"
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  base.adapter_checkpoint="${DOWNLOADED_PATH}/adapter_model.pt" \
+  base.adapter_config="${DOWNLOADED_PATH}/adapter_config.json" \
+  base.tokenizer_path="${DOWNLOADED_PATH}/tokenizer.model" \
+  model.use_kv_cache=true \
+  model.use_sdpa_with_kv_cache=true \
+  model.dtype_override="fp32" \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  export.output_name="${MODEL_PROGRAM_ONLY}.pte" \
+  export.foundation_weights_file="${MODEL_FOUNDATION_WEIGHTS}.ptd" \
+  export.lora_weights_file="${MODEL_LORA_WEIGHTS}.ptd"
+
+# Run llama runner.
+NOW=$(date +"%H:%M:%S")
+echo "Starting to run llama runner at ${NOW}"
+# shellcheck source=/dev/null
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_PROGRAM_ONLY}.pte --data_paths="${MODEL_FOUNDATION_WEIGHTS}.ptd,${MODEL_LORA_WEIGHTS}.ptd" --prompt="${PROMPT}" ${RUNTIME_ARGS} > result3.txt
+NOW=$(date +"%H:%M:%S")
+echo "Finished at ${NOW}"
+
+RESULT3=$(cat result3.txt)
+if [[ "${RESULT3}" == "${EXPECTED_PREFIX}"* ]]; then
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT3}"
+  echo "Success"
+else
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT3}"
+  echo "Failure; results not the same"
+  cleanup_files
+  exit 1
+fi
+
+cleanup_files

.github/workflows/cuda.yml

Lines changed: 5 additions & 5 deletions
@@ -1,7 +1,7 @@
 # Test ExecuTorch CUDA Build Compatibility
 # This workflow tests whether ExecuTorch can be successfully built with CUDA support
 # across different CUDA versions (12.6, 12.8, 12.9) using the command:
-# CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+# ./install_executorch.sh
 #
 # Note: ExecuTorch automatically detects the system CUDA version using nvcc and
 # installs the appropriate PyTorch wheel. No manual CUDA/PyTorch installation needed.
@@ -43,7 +43,7 @@ jobs:
        set -eux
 
        # Test ExecuTorch CUDA build - ExecuTorch will automatically detect CUDA version
-        # and install the appropriate PyTorch wheel when CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
+        # and install the appropriate PyTorch wheel
        source .ci/scripts/test-cuda-build.sh "${{ matrix.cuda-version }}"
 
   # This job will fail if any of the CUDA versions fail
@@ -71,7 +71,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        model: [linear, add, add_mul, resnet18]
+        model: [linear, add, add_mul, resnet18, conv1d]
     with:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
@@ -83,7 +83,7 @@ jobs:
      script: |
        set -eux
 
-        PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+        PYTHON_EXECUTABLE=python ./install_executorch.sh
        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
        PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
 
@@ -110,7 +110,7 @@ jobs:
        set -eux
 
        echo "::group::Setup ExecuTorch"
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+        ./install_executorch.sh
        echo "::endgroup::"
 
        echo "::group::Setup Huggingface"
.github/workflows/lint.yml

Lines changed: 22 additions & 13 deletions
@@ -143,19 +143,28 @@ jobs:
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
-        FILES_NEEDS_FORMAT=$(/opt/google-java-format -n \
-          extension/android/executorch_android/src/main/java/org/pytorch/executorch/*.java \
-          extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/*.java \
-          extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/*.java \
-          extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/*.java \
-          extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java \
-          extension/benchmark/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/*.java)
+        FILES_NEEDS_FORMAT=$(find extension/android/executorch_android/src/main/java/org/pytorch/executorch \
+          extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm \
+          extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations \
+          extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch \
+          extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench \
+          extension/benchmark/android/benchmark/app/src/androidTest/java/org/pytorch/minibench \
+          -type f -name "*.java" 2>/dev/null | \
+          xargs -r /opt/google-java-format -n)
+
        if [ -n "$FILES_NEEDS_FORMAT" ]; then
-          echo "Warning: The following files need formatting. Please use google-java-format."
-          echo "Use a binary from https://github.com/google/google-java-format/releases/"
-          echo "For example:"
-          echo "wget https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64"
-          echo "chmod +x google-java-format_linux-x86-64"
-          echo "./google-java-format_linux-x86-64 -i $FILES_NEEDS_FORMAT"
+          echo "Warning: The following files need formatting:"
+          echo "$FILES_NEEDS_FORMAT"
+          echo ""
+          echo "Please use google-java-format from https://github.com/google/google-java-format/releases/"
+          echo ""
+          echo "To fix, run one of these commands:"
+          echo "  # Using xargs (recommended):"
+          echo "  find <paths> -type f -name '*.java' | xargs google-java-format -i"
+          echo ""
+          echo "  # Or format specific files:"
+          echo "$FILES_NEEDS_FORMAT" | while IFS= read -r file; do
+            echo "  google-java-format -i \"$file\""
+          done
          exit 1
        fi

README.md

Lines changed: 7 additions & 3 deletions
@@ -104,14 +104,16 @@ outputs = method.execute([torch.randn(1, 3, 224, 224)])
 
 Module module("model.pte");
 auto tensor = make_tensor_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
-auto outputs = module.forward({tensor});
+auto outputs = module.forward(tensor);
 ```
 
 **[Swift (iOS)](https://docs.pytorch.org/executorch/main/ios-section.html)**
 ```swift
+import ExecuTorch
+
 let module = Module(filePath: "model.pte")
-let input = Tensor<Float>([1.0, 2.0, 3.0, 4.0])
-let outputs: [Value] = try module.forward([input])
+let input = Tensor<Float>([1.0, 2.0, 3.0, 4.0], shape: [2, 2])
+let outputs = try module.forward(input)
 ```
 
 **[Kotlin (Android)](https://docs.pytorch.org/executorch/main/android-section.html)**
@@ -151,6 +153,8 @@ runner->generate("Hello, how are you?", config);
 
 **[Swift (iOS)](https://docs.pytorch.org/executorch/main/llm/run-on-ios.html)**
 ```swift
+import ExecuTorchLLM
+
 let runner = TextRunner(modelPath: "llama.pte", tokenizerPath: "tiktoken.bin")
 try runner.generate("Hello, how are you?", Config {
   $0.sequenceLength = 128

backends/aoti/common_shims.cpp

Lines changed: 12 additions & 0 deletions
@@ -172,6 +172,18 @@ int32_t aoti_torch_dtype_bfloat16() {
   return 15; // PyTorch's bfloat16 dtype code
 }
 
+int32_t aoti_torch_dtype_int8() {
+  return 1; // PyTorch's int8 dtype code
+}
+
+int32_t aoti_torch_dtype_int16() {
+  return 2; // PyTorch's int16 dtype code
+}
+
+int32_t aoti_torch_dtype_int32() {
+  return 3; // PyTorch's int32 dtype code
+}
+
 int32_t aoti_torch_dtype_int64() {
   return 4; // PyTorch's int64 dtype code
 }

backends/aoti/common_shims.h

Lines changed: 3 additions & 0 deletions
@@ -59,6 +59,9 @@ int32_t aoti_torch_device_type_cpu();
 int32_t aoti_torch_layout_strided();
 int32_t aoti_torch_dtype_float32();
 int32_t aoti_torch_dtype_bfloat16();
+int32_t aoti_torch_dtype_int8();
+int32_t aoti_torch_dtype_int16();
+int32_t aoti_torch_dtype_int32();
 int32_t aoti_torch_dtype_int64();
 
 // Dtype utility function needed by Metal backend

backends/aoti/utils.h

Lines changed: 6 additions & 0 deletions
@@ -34,6 +34,12 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
   // Convert based on known PyTorch dtype codes (without CUDA-specific
   // dependency)
   switch (dtype) {
+    case 1: // PyTorch's int8 dtype code
+      return executorch::aten::ScalarType::Char;
+    case 2: // PyTorch's int16 dtype code
+      return executorch::aten::ScalarType::Short;
+    case 3: // PyTorch's int32 dtype code
+      return executorch::aten::ScalarType::Int;
     case 4: // PyTorch's int64 dtype code
       return executorch::aten::ScalarType::Long;
     case 6: // PyTorch's float32 dtype code
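
The integer codes in these hunks follow PyTorch's ScalarType numbering (Char = 1, Short = 2, Int = 3, Long = 4, Float = 6, BFloat16 = 15). A minimal, self-contained sketch of the mapping this change extends, using a hypothetical stand-in enum rather than the real executorch::aten::ScalarType, could look like:

```cpp
// Illustrative sketch only; ScalarType below is a hypothetical stand-in for
// executorch::aten::ScalarType, and the error handling is simplified.
#include <cstdint>
#include <stdexcept>

enum class ScalarType { Char, Short, Int, Long, Float, BFloat16 };

inline ScalarType dtype_to_scalar_type(int32_t dtype) {
  switch (dtype) {
    case 1:  return ScalarType::Char;     // PyTorch int8
    case 2:  return ScalarType::Short;    // PyTorch int16
    case 3:  return ScalarType::Int;      // PyTorch int32
    case 4:  return ScalarType::Long;     // PyTorch int64
    case 6:  return ScalarType::Float;    // PyTorch float32
    case 15: return ScalarType::BFloat16; // PyTorch bfloat16
    default: throw std::runtime_error("unsupported dtype code");
  }
}
```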

backends/apple/metal/README.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+# Metal Backend
+
+⚠️ **EXPERIMENTAL BACKEND**
+
+This backend is currently in experimental development and may not be fully functional or stable. Use with caution.
