
Commit 3a4ffce

Merge remote-tracking branch 'origin/main' into start-pos-api-llava-7-9
2 parents: d62be5a + 02fafa8

1,296 files changed: +56,966 / −22,037 lines

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-40b02a2dc61bbf901a2df91719f47c98d65368ec
+828ae02053a6e0e20a2dfd6e737ba10c6f4dee6b
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-e7152ff8a6a929a0db7f3f4a72a5b6d471769cd3
+53a2908a10f414a2f85caa06703a26a40e873869

.ci/scripts/setup-samsung-linux-deps.sh

Lines changed: 1 addition & 10 deletions

@@ -11,7 +11,7 @@ set -ex

 download_ai_lite_core() {
   API_BASE="https://soc-developer.semiconductor.samsung.com/api/v1/resource/ai-litecore/download"
-  API_KEY="kn10SoSY3hkC-9Qny5TqD2mnqVrlupv3krnjLeBt5cY"
+  API_KEY=$SAMSUNG_AI_LITECORE_KEY

   VERSION="0.5"
   OS_NAME="Ubuntu 22.04"

@@ -54,15 +54,6 @@ install_enn_backend() {
   rm -rf "${NDK_INSTALLATION_DIR}" && sudo mkdir -p "${NDK_INSTALLATION_DIR}"
   ANDROID_NDK_VERSION=r27b

-  pushd .
-  cd /tmp
-  curl -Os --retry 3 "https://ossci-android.s3.amazonaws.com/android-ndk-${ANDROID_NDK_VERSION}-linux.zip"
-  unzip -qo "android-ndk-${ANDROID_NDK_VERSION}-linux.zip"
-
-  # Print the content for manual verification
-  ls -lah "android-ndk-${ANDROID_NDK_VERSION}"
-  sudo mv "android-ndk-${ANDROID_NDK_VERSION}"/* "${NDK_INSTALLATION_DIR}"
-  popd
   # build Exynos backend
   export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/opt/ndk}
   bash backends/samsung/build.sh --build all
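
This change stops committing the AI LiteCore API key and reads it from the environment instead, and it drops the ad-hoc NDK download in favor of an NDK already present on the runner (note the ${ANDROID_NDK_ROOT:-/opt/ndk} default). A minimal sketch of the environment a local run would need; the key value is a placeholder, not anything from this commit:

    # Sketch only: export the secret and point at a preinstalled NDK before running the script
    export SAMSUNG_AI_LITECORE_KEY="<key from your secret store>"   # replaces the removed hardcoded key
    export ANDROID_NDK_ROOT=/opt/ndk                                # matches the script's default
    bash .ci/scripts/setup-samsung-linux-deps.sh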

.ci/scripts/setup-windows.ps1

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 param (
-    [string]$editable = $false
+    [string]$editable = "false"
 )

 conda create --yes --quiet -n et python=3.12
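
A note on this fix: with a [string]-typed parameter, the boolean default $false is stringified to "False"; quoting the default keeps it the literal lowercase "false" that downstream string comparisons presumably expect (the consuming check is not visible in this diff, so that reading is inferred).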

.ci/scripts/test-cuda-build.sh

Lines changed: 95 additions & 0 deletions

@@ -0,0 +1,95 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+CUDA_VERSION=${1:-"12.6"}
+
+echo "=== Testing ExecutorTorch CUDA ${CUDA_VERSION} Build ==="
+
+# Function to build and test ExecutorTorch with CUDA support
+test_executorch_cuda_build() {
+  local cuda_version=$1
+
+  echo "Building ExecutorTorch with CUDA ${cuda_version} support..."
+  echo "ExecutorTorch will automatically detect CUDA and install appropriate PyTorch wheel"
+
+  # Check available resources before starting
+  echo "=== System Information ==="
+  echo "Available memory: $(free -h | grep Mem | awk '{print $2}')"
+  echo "Available disk space: $(df -h . | tail -1 | awk '{print $4}')"
+  echo "CPU cores: $(nproc)"
+  echo "CUDA version check:"
+  nvcc --version || echo "nvcc not found"
+  nvidia-smi || echo "nvidia-smi not found"
+
+  # Set CMAKE_ARGS to enable CUDA build - ExecutorTorch will handle PyTorch installation automatically
+  export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
+
+  echo "=== Starting ExecutorTorch Installation ==="
+  # Install ExecutorTorch with CUDA support with timeout and error handling
+  timeout 5400 ./install_executorch.sh || {
+    local exit_code=$?
+    echo "ERROR: install_executorch.sh failed with exit code: $exit_code"
+    if [ $exit_code -eq 124 ]; then
+      echo "ERROR: Installation timed out after 90 minutes"
+    fi
+    exit $exit_code
+  }
+
+  echo "SUCCESS: ExecutorTorch CUDA build completed"
+
+  # Verify the installation
+  echo "=== Verifying ExecutorTorch CUDA Installation ==="
+
+  # Test that ExecutorTorch was built successfully
+  python -c "
+import executorch
+print('SUCCESS: ExecutorTorch imported successfully')
+"
+
+  # Test CUDA availability and show details
+  python -c "
+try:
+    import torch
+    print('INFO: PyTorch version:', torch.__version__)
+    print('INFO: CUDA available:', torch.cuda.is_available())
+
+    if torch.cuda.is_available():
+        print('SUCCESS: CUDA is available for ExecutorTorch')
+        print('INFO: CUDA version:', torch.version.cuda)
+        print('INFO: GPU device count:', torch.cuda.device_count())
+        print('INFO: Current GPU device:', torch.cuda.current_device())
+        print('INFO: GPU device name:', torch.cuda.get_device_name())
+
+        # Test basic CUDA tensor operation
+        device = torch.device('cuda')
+        x = torch.randn(10, 10).to(device)
+        y = torch.randn(10, 10).to(device)
+        z = torch.mm(x, y)
+        print('SUCCESS: CUDA tensor operation completed on device:', z.device)
+        print('INFO: Result tensor shape:', z.shape)
+
+        print('SUCCESS: ExecutorTorch CUDA integration verified')
+    else:
+        print('WARNING: CUDA not detected, but ExecutorTorch built successfully')
+        exit(1)
+except Exception as e:
+    print('ERROR: ExecutorTorch CUDA test failed:', e)
+    exit(1)
+"
+
+  echo "SUCCESS: ExecutorTorch CUDA ${cuda_version} build and verification completed successfully"
+}
+
+# Main execution
+echo "Current working directory: $(pwd)"
+echo "Directory contents:"
+ls -la
+
+# Run the CUDA build test
+test_executorch_cuda_build "${CUDA_VERSION}"
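
For context, this script is meant to be run from the ExecuTorch repository root, since it invokes ./install_executorch.sh relative to the working directory; the 5400-second timeout is the 90 minutes referenced in the error message. A sketch of the invocation:

    # Run from the repo root; the CUDA version argument is optional and defaults to 12.6
    bash .ci/scripts/test-cuda-build.sh
    bash .ci/scripts/test-cuda-build.sh 12.6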

.ci/scripts/test_backend_linux.sh

Lines changed: 6 additions & 1 deletion

@@ -39,12 +39,17 @@ if [[ "$FLOW" == *qnn* ]]; then
 fi

 if [[ "$FLOW" == *vulkan* ]]; then
-  # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
+  # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate.
   source .ci/scripts/setup-vulkan-linux-deps.sh

   EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
 fi

+if [[ "$FLOW" == *arm* ]]; then
+  # Setup ARM deps.
+  .ci/scripts/setup-arm-baremetal-tools.sh
+fi
+
 # We need the runner to test the built library.
 PYTHON_EXECUTABLE=python CMAKE_ARGS="$EXTRA_BUILD_ARGS" .ci/scripts/setup-linux.sh --build-tool cmake --build-mode Release --editable true
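
The new branch follows the same glob-match gating as the existing qnn and vulkan branches: any flow whose name contains "arm" triggers the baremetal toolchain setup. A standalone sketch of the pattern, with a hypothetical flow name:

    #!/usr/bin/env bash
    # Glob-based flow gating as used above; "arm_tosa" is a hypothetical example value
    FLOW="${FLOW:-arm_tosa}"
    if [[ "$FLOW" == *arm* ]]; then
        echo "arm flow detected: would run .ci/scripts/setup-arm-baremetal-tools.sh"
    fi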

.ci/scripts/test_llava.sh

Lines changed: 1 addition & 1 deletion

@@ -149,7 +149,7 @@ run_and_verify() {

   # verify result.txt
   RESULT=$(cat result.txt)
-  EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with"
+  EXPECTED_PREFIX="ASSISTANT: The image captures a basketball game in progress, with"

   if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
     echo "Expected result prefix: ${EXPECTED_PREFIX}"

.ci/scripts/test_model.ps1

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ function ExportModel-Xnnpack {
     [bool]$quantize
   )

-  if $(quantize) {
+  if ($quantize) {
    python -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --quantize | Write-Host
    $modelFile = "$($modelName)_xnnpack_q8.pte"
  } else {
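
For reference: PowerShell requires the if condition to be parenthesized, and $(quantize) is subexpression syntax that would try to invoke a command named quantize rather than read the $quantize parameter, so the original line could not work; ($quantize) is the correct form.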

.ci/scripts/test_model.sh

Lines changed: 3 additions & 3 deletions

@@ -131,13 +131,13 @@ test_model_with_xnnpack() {
     return 0
   fi

-  # Delegation
+  # Delegation and test with pybindings
  if [[ ${WITH_QUANTIZATION} == true ]]; then
    SUFFIX="q8"
-    "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --quantize
+    "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --quantize --test_after_export
  else
    SUFFIX="fp32"
-    "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate
+    "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --test_after_export
  fi

  OUTPUT_MODEL_PATH="${MODEL_NAME}_xnnpack_${SUFFIX}.pte"
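
Judging by the updated comment, --test_after_export appears to make the aot_compiler run the exported program via pybindings as part of the export step; the flag's implementation is not shown in this excerpt.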
Lines changed: 139 additions & 0 deletions

@@ -0,0 +1,139 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# -------------------------
+# Args / flags
+# -------------------------
+TEST_WITH_RUNNER=0
+MODEL_NAME=""
+
+# Parse args
+if [[ $# -lt 1 ]]; then
+  echo "Usage: $0 <model_name> [--test_with_runner]"
+  echo "Supported model_name values: qwen3_4b, phi_4_mini"
+  exit 1
+fi
+
+MODEL_NAME="$1"
+shift
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --test_with_runner)
+      TEST_WITH_RUNNER=1
+      ;;
+    -h|--help)
+      echo "Usage: $0 <model_name> [--test_with_runner]"
+      echo "  model_name: qwen3_4b | phi_4_mini"
+      echo "  --test_with_runner: build ET + run llama_main to sanity-check the export"
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1"
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+MODEL_OUT=model.pte
+
+case "$MODEL_NAME" in
+  qwen3_4b)
+    echo "Running Qwen3-4B export..."
+    HF_MODEL_DIR=$(hf download pytorch/Qwen3-4B-INT8-INT4)
+    EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB
+    $PYTHON_EXECUTABLE -m executorch.examples.models.qwen3.convert_weights \
+      $HF_MODEL_DIR \
+      pytorch_model_converted.bin
+
+    $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \
+      --model "qwen3_4b" \
+      --checkpoint pytorch_model_converted.bin \
+      --params examples/models/qwen3/config/4b_config.json \
+      --output_name $MODEL_OUT \
+      -kv \
+      --use_sdpa_with_kv_cache \
+      -X \
+      --xnnpack-extended-ops \
+      --max_context_length 1024 \
+      --max_seq_length 1024 \
+      --dtype fp32 \
+      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
+    ;;
+
+  phi_4_mini)
+    echo "Running Phi-4-mini export..."
+    HF_MODEL_DIR=$(hf download pytorch/Phi-4-mini-instruct-INT8-INT4)
+    EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB
+    $PYTHON_EXECUTABLE -m executorch.examples.models.phi_4_mini.convert_weights \
+      $HF_MODEL_DIR \
+      pytorch_model_converted.bin
+
+    $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \
+      --model "phi_4_mini" \
+      --checkpoint pytorch_model_converted.bin \
+      --params examples/models/phi_4_mini/config/config.json \
+      --output_name $MODEL_OUT \
+      -kv \
+      --use_sdpa_with_kv_cache \
+      -X \
+      --xnnpack-extended-ops \
+      --max_context_length 1024 \
+      --max_seq_length 1024 \
+      --dtype fp32 \
+      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
+    ;;
+
+  *)
+    echo "Error: unsupported model_name '$MODEL_NAME'"
+    echo "Supported values: qwen3_4b, phi_4_mini"
+    exit 1
+    ;;
+esac
+
+# Check file size
+MODEL_SIZE=$(stat --printf="%s" $MODEL_OUT 2>/dev/null || stat -f%z $MODEL_OUT)
+if [[ $MODEL_SIZE -gt $EXPECTED_MODEL_SIZE_UPPER_BOUND ]]; then
+  echo "Error: model size $MODEL_SIZE is greater than expected upper bound $EXPECTED_MODEL_SIZE_UPPER_BOUND"
+  exit 1
+fi
+
+# Install ET with CMake
+if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
+  echo "[runner] Building and testing llama_main ..."
+  cmake -DPYTHON_EXECUTABLE=python \
+    -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DEXECUTORCH_ENABLE_LOGGING=1 \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+    -DEXECUTORCH_BUILD_XNNPACK=ON \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
+    -Bcmake-out .
+  cmake --build cmake-out -j16 --config Release --target install
+
+
+  # Install llama runner
+  cmake -DPYTHON_EXECUTABLE=python \
+    -DCMAKE_BUILD_TYPE=Release \
+    -Bcmake-out/examples/models/llama \
+    examples/models/llama
+  cmake --build cmake-out/examples/models/llama -j16 --config Release
+
+  # Run the model
+  ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path="${HF_MODEL_DIR}/tokenizer.json" --prompt="Once upon a time,"
+fi
+
+# Clean up
+rm -f pytorch_model_converted.bin "$MODEL_OUT"
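
A sketch of how this new export script would be driven. Its path is not visible in this excerpt, so the name below is a placeholder, and hf download assumes the Hugging Face CLI is installed and authenticated:

    # <export_script.sh> is a placeholder; the real path is not shown in this diff
    bash <export_script.sh> qwen3_4b
    bash <export_script.sh> phi_4_mini --test_with_runner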
