diff --git a/.github/workflows/android-perf-private-device-experiment.yml b/.github/workflows/android-perf-private-device-experiment.yml deleted file mode 100644 index cf37538f620..00000000000 --- a/.github/workflows/android-perf-private-device-experiment.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: android-perf (private devices) - -on: - schedule: - - cron: 0 0,4,8,12,16,20 * * * - pull_request: - paths: - - .github/workflows/android-perf-private-device-experiment.yml - push: - branches: - - main - paths: - - .github/workflows/android-perf-private-device-experiment.yml - # Note: GitHub has an upper limit of 10 inputs - workflow_dispatch: - inputs: - models: - description: Models to be benchmarked - required: false - type: string - default: Qwen/Qwen3-0.6B - devices: - description: Target devices to run benchmark - required: false - type: string - default: samsung_galaxy_s22+private - benchmark_configs: - description: The list of configs used by the benchmark - required: false - type: string - workflow_call: - inputs: - models: - description: Models to be benchmarked - required: false - type: string - default: Qwen/Qwen3-0.6B - devices: - description: Target devices to run benchmark - required: false - type: string - default: samsung_galaxy_s22+private - benchmark_configs: - description: The list of configs used by the benchmark - required: false - type: string - -concurrency: - group: android-perf-private-devices-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} - cancel-in-progress: true - -jobs: - android: - uses: ./.github/workflows/android-perf.yml - secrets: inherit - permissions: - id-token: write - contents: read - with: - models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }} - devices: samsung_galaxy_s22+private - benchmark_configs: ${{ inputs.benchmark_configs }} diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml deleted file mode 100644 index 33937531a01..00000000000 --- a/.github/workflows/android-perf.yml +++ /dev/null @@ -1,562 +0,0 @@ -name: android-perf - -on: - schedule: - - cron: 0 0,8,16 * * * - pull_request: - paths: - - .github/workflows/android-perf.yml - - .ci/scripts/gather_benchmark_configs.py - - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2 - push: - branches: - - main - paths: - - .github/workflows/android-perf.yml - - .ci/scripts/gather_benchmark_configs.py - - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2 - # Note: GitHub has an upper limit of 10 inputs - workflow_dispatch: - inputs: - models: - description: Models to be benchmarked - required: false - type: string - default: Qwen/Qwen3-0.6B - devices: - description: Target devices to run benchmark - required: false - type: string - default: samsung_galaxy_s22+public - benchmark_configs: - description: The list of configs used by the benchmark - required: false - type: string - workflow_call: - inputs: - models: - description: Models to be benchmarked - required: false - type: string - default: Qwen/Qwen3-0.6B - devices: - description: Target devices to run benchmark - required: false - type: string - default: samsung_galaxy_s22+public - benchmark_configs: - description: The list of configs used by the benchmark - required: false - type:
string - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} - cancel-in-progress: true - -jobs: - set-parameters: - runs-on: ubuntu-22.04 - outputs: - benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }} - steps: - - uses: actions/checkout@v3 - with: - submodules: 'false' - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Set parameters - id: set-parameters - shell: bash - env: - # Separate default values from the workflow dispatch to ensure defaults are accessible - # during scheduled runs and to provide flexibility for different defaults between - # on-demand and periodic benchmarking. - CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }} - CRON_DEFAULT_DEVICES: samsung_galaxy_s22+public - run: | - set -eux - - ARGS="--os android" - - MODELS="${{ inputs.models }}" - if [ -z "$MODELS" ]; then - MODELS="$CRON_DEFAULT_MODELS" - fi - ARGS="$ARGS --models $MODELS" - - DEVICES="${{ inputs.devices }}" - if [ -z "$DEVICES" ]; then - DEVICES="$CRON_DEFAULT_DEVICES" - fi - ARGS="$ARGS --devices $DEVICES" - - BENCHMARK_CONFIGS="${{ inputs.benchmark_configs }}" - if [ -n "$BENCHMARK_CONFIGS" ]; then - ARGS="$ARGS --configs $BENCHMARK_CONFIGS" - fi - - PYTHONPATH="${PWD}" python .ci/scripts/gather_benchmark_configs.py $ARGS - - prepare-test-specs: - runs-on: linux.2xlarge - needs: set-parameters - strategy: - matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} - fail-fast: false - steps: - - uses: actions/checkout@v3 - - - name: Prepare the spec - id: prepare - shell: bash - env: - BENCHMARK_CONFIG: ${{ toJSON(matrix) }} - working-directory: extension/benchmark/android/benchmark - run: | - set -eux - - # The model will be exported in the next step to this S3 path - MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip" - # We could write a script to properly use jinja here, but there is only one variable, - # so let's just sed it - sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' android-llm-device-farm-test-spec.yml.j2 - - BENCHMARK_CONFIG_ID=$(echo "${{ matrix.model }}_${{ matrix.config }}" | sed -e 's/[^A-Za-z0-9._-]/_/g') - # The config for this benchmark run; we save it in the test spec so that it can be fetched - # later by the upload script - sed -i -e 's,{{ benchmark_config_id }},'"${BENCHMARK_CONFIG_ID}"',g' android-llm-device-farm-test-spec.yml.j2 - - cp android-llm-device-farm-test-spec.yml.j2 android-llm-device-farm-test-spec.yml - # Just print the test spec for debugging - cat android-llm-device-farm-test-spec.yml - - # Save the benchmark configs so that we can use them later in the dashboard - echo "${BENCHMARK_CONFIG}" > "${BENCHMARK_CONFIG_ID}.json" - echo "benchmark-config-id=${BENCHMARK_CONFIG_ID}" >> $GITHUB_OUTPUT - - - name: Upload the spec - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }} - retention-days: 1 -
if-no-files-found: error - path: extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml - - - name: Update the benchmark configs - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/ - retention-days: 1 - if-no-files-found: error - path: extension/benchmark/android/benchmark/${{ steps.prepare.outputs.benchmark-config-id }}.json - - export-models: - name: export-models - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - permissions: - id-token: write - contents: read - needs: set-parameters - secrets: inherit - strategy: - matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} - fail-fast: false - with: - runner: linux.2xlarge.memory - docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk - submodules: 'recursive' - timeout: 60 - upload-artifact: android-models - upload-artifact-to-s3: true - secrets-env: EXECUTORCH_HF_TOKEN - script: | - # The generic Linux job chooses to use base env, not the one set up by the image - echo "::group::Setting up dev environment" - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - if [[ ${{ matrix.config }} == *"qnn"* ]]; then - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh - PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - fi - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" - # Install requirements for export_llama - PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh - - pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - pip install accelerate sentencepiece - pip list - - ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.config }} - echo "::endgroup::" - - echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}" - BUILD_MODE="cmake" - - if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then - # HuggingFace model.
Assume the pattern is always like "<org>/<model>" - HF_MODEL_REPO=${{ matrix.model }} - OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}" - - # Convert HF checkpoint to ET via etLLM path - if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then - if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then - # SpinQuant - # Download prequantized checkpoint from Hugging Face - DOWNLOADED_PATH=$( - bash .ci/scripts/download_hf_hub.sh \ - --model_id "${HF_MODEL_REPO}" \ - --files "tokenizer.model" "params.json" "consolidated.00.pth" - ) - # Export using ExecuTorch's model definition - python -m extension.llm.export.export_llm \ - base.model_class="llama3_2" \ - base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ - base.params="${DOWNLOADED_PATH}/params.json" \ - model.use_sdpa_with_kv_cache=true \ - backend.xnnpack.enabled=true \ - backend.xnnpack.extended_ops=true \ - base.preq_mode="preq_8da4w_out_8da8w" \ - base.preq_group_size=32 \ - export.max_seq_length=2048 \ - export.max_context_length=2048 \ - export.output_name="${OUT_ET_MODEL_NAME}.pte" \ - model.use_kv_cache=true \ - model.dtype_override=fp32 \ - base.preq_embedding_quantize=\'8,0\' \ - quantization.use_spin_quant=native \ - base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' - ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then - # QAT + LoRA - # Download prequantized checkpoint from Hugging Face - DOWNLOADED_PATH=$( - bash .ci/scripts/download_hf_hub.sh \ - --model_id "${HF_MODEL_REPO}" \ - --files "tokenizer.model" "params.json" "consolidated.00.pth" - ) - # Export using ExecuTorch's model definition - python -m extension.llm.export.export_llm \ - base.model_class="llama3_2" \ - base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ - base.params="${DOWNLOADED_PATH}/params.json" \ - quantization.use_qat=true \ - base.use_lora=16 \ - base.preq_mode="preq_8da4w_out_8da8w" \ - base.preq_group_size=32 \ - base.preq_embedding_quantize=\'8,0\' \ - model.use_sdpa_with_kv_cache=true \ - model.use_kv_cache=true \ - backend.xnnpack.enabled=true \ - backend.xnnpack.extended_ops=true \ - model.dtype_override=fp32 \ - export.max_seq_length=2048 \ - export.max_context_length=2048 \ - export.output_name="${OUT_ET_MODEL_NAME}.pte" \ - base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' - ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then - # Original BF16 version, without any quantization - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - python -m extension.llm.export.export_llm \ - base.model_class="llama3_2" \ - base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ - base.params="${DOWNLOADED_PATH}/params.json" \ - model.use_kv_cache=true \ - model.use_sdpa_with_kv_cache=true \ - backend.xnnpack.enabled=true \ - model.dtype_override=bf16 \ - base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \ - export.output_name="${OUT_ET_MODEL_NAME}.pte" - ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - python -m extension.llm.export.export_llm \ - base.model_class=llama3_2 \ -
base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ - base.params="${DOWNLOADED_PATH}/params.json" \ - model.use_kv_cache=true \ - model.use_sdpa_with_kv_cache=true \ - model.dtype_override=fp32 \ - backend.xnnpack.enabled=true \ - backend.xnnpack.extended_ops=true \ - quantization.qmode=8da4w \ - quantization.group_size=32 \ - quantization.embedding_quantize=\'8,0\' \ - base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \ - export.output_name="${OUT_ET_MODEL_NAME}.pte" - ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then - export QNN_SDK_ROOT=/tmp/qnn/2.37.0.250724 - export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/ - export PYTHONPATH=$(pwd)/.. - - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - python -m examples.qualcomm.oss_scripts.llama3_2.llama -- \ - --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ - --params "${DOWNLOADED_PATH}/params.json" \ - --tokenizer_model "${DOWNLOADED_PATH}/tokenizer.model" \ - --compile_only \ - --ptq 16a4w \ - -m SM8650 \ - --model_size 1B \ - --model_mode kv \ - --prompt "Once" - - OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script - find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \; - ls -lh "${OUT_ET_MODEL_NAME}.pte" - fi - elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then - if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json") - python -m extension.llm.export.export_llm \ - base.model_class=qwen3_0_6b \ - base.params=examples/models/qwen3/config/0_6b_config.json \ - model.use_kv_cache=true \ - model.use_sdpa_with_kv_cache=true \ - model.dtype_override=fp32 \ - backend.xnnpack.enabled=true \ - backend.xnnpack.extended_ops=true \ - quantization.qmode=8da4w \ - quantization.group_size=32 \ - quantization.embedding_quantize=\'8,0\' \ - base.metadata='"{\"get_bos_id\":151644,\"get_eos_ids\":[151645]}"' \ - export.output_name="${OUT_ET_MODEL_NAME}.pte" - ls -lh "${OUT_ET_MODEL_NAME}.pte" - fi - fi - - if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then - DOWNLOADED_PATH=$( - bash .ci/scripts/download_hf_hub.sh \ - --model_id "${HF_MODEL_REPO}" \ - --files "tokenizer.json" - ) - echo "tokenizer.json is downloaded to $DOWNLOADED_PATH" - - # Install optimum-executorch - OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - git clone https://github.com/huggingface/optimum-executorch - pushd optimum-executorch - # There is no release yet; for CI stability, always test from the same commit on main - git checkout $OPTIMUM_ET_COMMIT - python install_dev.py --skip_override_torch - pip list - - ARGS=( - "--model" "${HF_MODEL_REPO}" - "--task" "text-generation" - "--recipe" "xnnpack" - "--use_custom_sdpa" - "--use_custom_kv_cache" - "--qlinear" "8da4w" - "--qembedding" "8w" - "--output_dir" ".."
- ) - - optimum-cli export executorch "${ARGS[@]}" - popd - - mv model.pte ${OUT_ET_MODEL_NAME}.pte - ls -lh "${OUT_ET_MODEL_NAME}.pte" - fi - - zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.* - ls -lh model.zip - mkdir -p ${ARTIFACTS_DIR_NAME} - mv model.zip ${ARTIFACTS_DIR_NAME} - ls -lh ${ARTIFACTS_DIR_NAME} - elif [[ ${{ matrix.model }} == "llama" ]]; then - # Install requirements for export_llama - PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh - # Test llama2 - if [[ ${{ matrix.config }} == *"xnnpack"* ]]; then - DELEGATE_CONFIG="xnnpack+custom+qe" - elif [[ ${{ matrix.config }} == *"qnn"* ]]; then - DELEGATE_CONFIG="qnn" - else - echo "Unsupported delegate ${{ matrix.config }}" - exit 1 - fi - DTYPE="fp32" - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \ - -model "${{ matrix.model }}" \ - -build_tool "${BUILD_MODE}" \ - -dtype "${DTYPE}" \ - -mode "${DELEGATE_CONFIG}" \ - -upload "${ARTIFACTS_DIR_NAME}" - else - PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh \ - "${{ matrix.model }}" \ - "${BUILD_MODE}" \ - "${{ matrix.config }}" \ - "${ARTIFACTS_DIR_NAME}" - fi - echo "::endgroup::" - - build-benchmark-app: - name: build-benchmark-app - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - permissions: - id-token: write - contents: read - needs: set-parameters - with: - runner: linux.2xlarge - docker-image: ci-image:executorch-ubuntu-22.04-clang12-android - submodules: 'recursive' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 90 - upload-artifact: android-apps - upload-artifact-to-s3: true - script: | - set -eux - - # Use sccache for NDK compiler as well - export CMAKE_CXX_COMPILER_LAUNCHER=sccache - export CMAKE_C_COMPILER_LAUNCHER=sccache - - # The generic Linux job chooses to use base env, not the one set up by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake - export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded - - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh - PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - - mkdir -p aar-out - PYTHON_EXECUTABLE=python ANDROID_ABIS="arm64-v8a" BUILD_AAR_DIR=aar-out EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.37.0.250724 EXECUTORCH_ANDROID_PROFILING=ON bash scripts/build_android_library.sh - mkdir -p extension/benchmark/android/benchmark/app/libs - cp aar-out/executorch.aar extension/benchmark/android/benchmark/app/libs - pushd extension/benchmark/android/benchmark - ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest - popd - MINIBENCH_APP_DIR="${ARTIFACTS_DIR_NAME}/minibench" - mkdir -p "${MINIBENCH_APP_DIR}" - cp extension/benchmark/android/benchmark/app/build/outputs/apk/debug/*.apk "${MINIBENCH_APP_DIR}" - cp extension/benchmark/android/benchmark/app/build/outputs/apk/androidTest/debug/*.apk "${MINIBENCH_APP_DIR}" - - # Let's see how expensive this job is; we might want to tone it down by running it periodically - # CHANGE IF this job name 'benchmark-on-device' changes: extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py - benchmark-on-device: - if: always() - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main - needs: - - set-parameters - - prepare-test-specs - - build-benchmark-app - - export-models
strategy: - matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} - fail-fast: false - with: - # Due to scheduling, a job may be pushed beyond the default 60m threshold - timeout: 240 - device-type: android - runner: linux.2xlarge - test-infra-ref: '' - # This is the ARN of the ExecuTorch project on AWS - project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 - device-pool-arn: ${{ matrix.device_arn }} - android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk - android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk - test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/android-llm-device-farm-test-spec.yml - new-output-format-flag: true - - upload-benchmark-results: - needs: - - benchmark-on-device - if: always() - runs-on: linux.2xlarge - environment: upload-benchmark-results - permissions: - id-token: write - contents: read - steps: - - uses: actions/checkout@v3 - with: - submodules: false - - - name: Authenticate with AWS - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results - # The max duration enforced by the server side - role-duration-seconds: 18000 - aws-region: us-east-1 - - - name: Setup conda - uses: pytorch/test-infra/.github/actions/setup-miniconda@main - with: - python-version: '3.10' - - - name: Download the list of artifacts from S3 - env: - ARTIFACTS_S3_DIR: s3://gha-artifacts/device_farm/${{ github.run_id }}/${{ github.run_attempt }}/artifacts/ - shell: bash - run: | - set -eux - ${CONDA_RUN} python -mpip install awscli==1.32.18 - - mkdir -p artifacts - pushd artifacts - ${CONDA_RUN} aws s3 sync "${ARTIFACTS_S3_DIR}" . - popd - - ls -lah artifacts - - - name: Download the list of benchmark configs from S3 - env: - BENCHMARK_CONFIGS_DIR: s3://gha-artifacts/${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/ - shell: bash - run: | - set -eux - - mkdir -p benchmark-configs - pushd benchmark-configs - ${CONDA_RUN} aws s3 sync "${BENCHMARK_CONFIGS_DIR}" .
- popd - - ls -lah benchmark-configs - - - name: Extract the benchmark results JSON - shell: bash - env: - DEVICE_TYPE: android - run: | - set -eux - - mkdir -p benchmark-results - - for ARTIFACTS_BY_JOB in artifacts/*.json; do - [ -f "${ARTIFACTS_BY_JOB}" ] || break - echo "${ARTIFACTS_BY_JOB}" - ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \ - --artifacts "${ARTIFACTS_BY_JOB}" \ - --output-dir benchmark-results \ - --app "${DEVICE_TYPE}" \ - --benchmark-configs benchmark-configs - done - - for BENCHMARK_RESULTS in benchmark-results/v3/*.json; do - cat "${BENCHMARK_RESULTS}" - echo - done - - - name: Upload the benchmark results (v3) - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main - with: - benchmark-results-dir: benchmark-results/v3 - dry-run: false - schema-version: v3 - github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/apple-perf-private-device-experiment.yml b/.github/workflows/apple-perf-private-device-experiment.yml deleted file mode 100644 index 47e2c6c9340..00000000000 --- a/.github/workflows/apple-perf-private-device-experiment.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: apple-perf (private devices) - -on: - schedule: - - cron: 0 0,4,8,12,16,20 * * * - pull_request: - paths: - - .github/workflows/apple-perf-private-device-experiment.yml - push: - branches: - - main - paths: - - .github/workflows/apple-perf-private-device-experiment.yml - # Note: GitHub has an upper limit of 10 inputs - workflow_dispatch: - inputs: - models: - description: Models to be benchmarked - required: false - type: string - default: Qwen/Qwen3-0.6B - devices: - description: Target devices to run benchmark - required: false - type: string - default: apple_iphone_15+pro_private - benchmark_configs: - description: The list of configs used by the benchmark - required: false - type: string - workflow_call: - inputs: - models: - description: Models to be benchmarked - required: false - type: string - default: Qwen/Qwen3-0.6B - devices: - description: Target devices to run benchmark - required: false - type: string - default: apple_iphone_15+pro_private - benchmark_configs: - description: The list of configs used by the benchmark - required: false - type: string - -concurrency: - group: apple-perf-private-devices-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} - cancel-in-progress: true - -jobs: - apple: - uses: ./.github/workflows/apple-perf.yml - secrets: inherit - permissions: - id-token: write - contents: read - with: - models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }} - devices: apple_iphone_15+pro_private - benchmark_configs: ${{ inputs.benchmark_configs }} diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml deleted file mode 100644 index 56fc67d1617..00000000000 --- a/.github/workflows/apple-perf.yml +++ /dev/null @@ -1,603 +0,0 @@ -name: apple-perf - -on: - schedule: - - cron: 0 1 * * * - pull_request: - paths: - - .github/workflows/apple-perf.yml - - .ci/scripts/gather_benchmark_configs.py - - extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2 - push: - branches: - - main - paths: - - .github/workflows/apple-perf.yml - - .ci/scripts/gather_benchmark_configs.py - -
extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2 - # Note: GitHub has an upper limit of 10 inputs - workflow_dispatch: - inputs: - models: - description: Models to be benchmarked - required: false - type: string - default: Qwen/Qwen3-0.6B - devices: - description: Target devices to run benchmark - required: false - type: string - default: apple_iphone_15+public - benchmark_configs: - description: The list of configs used by the benchmark - required: false - type: string - workflow_call: - inputs: - models: - description: Models to be benchmarked - required: false - type: string - default: Qwen/Qwen3-0.6B - devices: - description: Target devices to run benchmark - required: false - type: string - default: apple_iphone_15+public - benchmark_configs: - description: The list of configs used by the benchmark - required: false - type: string - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} - cancel-in-progress: true - -jobs: - set-parameters: - runs-on: ubuntu-22.04 - outputs: - benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }} - steps: - - uses: actions/checkout@v3 - with: - submodules: 'false' - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Set parameters - id: set-parameters - shell: bash - env: - # Separate default values from the workflow dispatch to ensure defaults are accessible - # during scheduled runs and to provide flexibility for different defaults between - # on-demand and periodic benchmarking. - CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }} - CRON_DEFAULT_DEVICES: apple_iphone_15+public - run: | - set -eux - - ARGS="--os ios" - - MODELS="${{ inputs.models }}" - if [ -z "$MODELS" ]; then - MODELS="$CRON_DEFAULT_MODELS" - fi - ARGS="$ARGS --models $MODELS" - - DEVICES="${{ inputs.devices }}" - if [ -z "$DEVICES" ]; then - DEVICES="$CRON_DEFAULT_DEVICES" - fi - ARGS="$ARGS --devices $DEVICES" - - BENCHMARK_CONFIGS="${{ inputs.benchmark_configs }}" - if [ -n "$BENCHMARK_CONFIGS" ]; then - ARGS="$ARGS --configs $BENCHMARK_CONFIGS" - fi - - PYTHONPATH="${PWD}" python .ci/scripts/gather_benchmark_configs.py $ARGS - - echo "benchmark_configs is: ${{ steps.set-parameters.outputs.benchmark_configs }}" - - prepare-test-specs: - runs-on: linux.2xlarge - needs: set-parameters - strategy: - matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} - fail-fast: false - steps: - - uses: actions/checkout@v3 - - - name: Prepare the spec - id: prepare - shell: bash - env: - BENCHMARK_CONFIG: ${{ toJSON(matrix) }} - working-directory: extension/benchmark/apple/Benchmark - run: | - set -eux - - # The model will be exported in the next step to this S3 path - MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip" - # We could write a script to properly use jinja here, but there is only one variable, - # so let's just sed it - sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' default-ios-device-farm-appium-test-spec.yml.j2 - -
BENCHMARK_CONFIG_ID=$(echo "${{ matrix.model }}_${{ matrix.config }}" | sed -e 's/[^A-Za-z0-9._-]/_/g') - # The config for this benchmark run; we save it in the test spec so that it can be fetched - # later by the upload script - sed -i -e 's,{{ benchmark_config_id }},'"${BENCHMARK_CONFIG_ID}"',g' default-ios-device-farm-appium-test-spec.yml.j2 - - cp default-ios-device-farm-appium-test-spec.yml.j2 default-ios-device-farm-appium-test-spec.yml - # Just print the test spec for debugging - cat default-ios-device-farm-appium-test-spec.yml - - # Save the benchmark configs so that we can use them later in the dashboard - echo "${BENCHMARK_CONFIG}" > "${BENCHMARK_CONFIG_ID}.json" - echo "benchmark-config-id=${BENCHMARK_CONFIG_ID}" >> $GITHUB_OUTPUT - - - name: Upload the spec - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }} - retention-days: 1 - if-no-files-found: error - path: extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml - - - name: Update the benchmark configs - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/ - retention-days: 1 - if-no-files-found: error - path: extension/benchmark/apple/Benchmark/${{ steps.prepare.outputs.benchmark-config-id }}.json - - export-models: - name: export-models - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - needs: set-parameters - secrets: inherit - strategy: - matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} - fail-fast: false - with: - # NB: Need to use our AWS MacOS runner to upload large models to S3 - runner: macos-m1-stable - python-version: '3.11' - submodules: 'recursive' - timeout: 60 - upload-artifact: ios-models - upload-artifact-to-s3: true - secrets-env: EXECUTORCH_HF_TOKEN - script: | - set -eux - - echo "::group::Setting up CI environment" - .ci/scripts/setup-conda.sh - - BUILD_TOOL=cmake - # Setup MacOS dependencies as there is no Docker support on MacOS atm - GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" - - if [[ ${{ matrix.config }} == *"coreml"* ]]; then - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - backends/apple/coreml/scripts/install_requirements.sh - fi - - # Install requirements for export_llama - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh - - pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - ${CONDA_RUN} pip install accelerate sentencepiece - pip list - - ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.config }} - echo "::endgroup::" - - echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}" - BUILD_MODE="cmake" - - if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then - # HuggingFace model.
Assume the pattern is always like "<org>/<model>" - HF_MODEL_REPO=${{ matrix.model }} - OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}" - - # Convert HF checkpoint to ET via etLLM path - if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then - # The benchmark app relies on the _llm suffix to determine whether the model is an LLM or not - OUT_ET_MODEL_NAME=${OUT_ET_MODEL_NAME}_llm - # Llama models on Hugging Face - if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then - # SpinQuant - # Download prequantized checkpoint from Hugging Face - DOWNLOADED_PATH=$( - bash .ci/scripts/download_hf_hub.sh \ - --model_id "${HF_MODEL_REPO}" \ - --files "tokenizer.model" "params.json" "consolidated.00.pth" - ) - # Export using ExecuTorch's model definition - ${CONDA_RUN} python -m extension.llm.export.export_llm \ - base.model_class="llama3_2" \ - base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ - base.params="${DOWNLOADED_PATH}/params.json" \ - model.use_sdpa_with_kv_cache=true \ - backend.xnnpack.enabled=true \ - backend.xnnpack.extended_ops=true \ - base.preq_mode="preq_8da4w_out_8da8w" \ - base.preq_group_size=32 \ - export.max_seq_length=2048 \ - export.max_context_length=2048 \ - export.output_name="${OUT_ET_MODEL_NAME}.pte" \ - model.use_kv_cache=true \ - model.dtype_override=fp32 \ - base.preq_embedding_quantize=\'8,0\' \ - quantization.use_spin_quant=native \ - base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' - ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then - # QAT + LoRA - # Download prequantized checkpoint from Hugging Face - DOWNLOADED_PATH=$( - bash .ci/scripts/download_hf_hub.sh \ - --model_id "${HF_MODEL_REPO}" \ - --files "tokenizer.model" "params.json" "consolidated.00.pth" - ) - # Export using ExecuTorch's model definition - ${CONDA_RUN} python -m extension.llm.export.export_llm \ - base.model_class="llama3_2" \ - base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ - base.params="${DOWNLOADED_PATH}/params.json" \ - quantization.use_qat=true \ - base.use_lora=16 \ - base.preq_mode="preq_8da4w_out_8da8w" \ - base.preq_group_size=32 \ - base.preq_embedding_quantize=\'8,0\' \ - model.use_sdpa_with_kv_cache=true \ - model.use_kv_cache=true \ - backend.xnnpack.enabled=true \ - backend.xnnpack.extended_ops=true \ - model.dtype_override=fp32 \ - export.max_seq_length=2048 \ - export.max_context_length=2048 \ - export.output_name="${OUT_ET_MODEL_NAME}.pte" \ - base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' - ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then - # Original BF16 version, without any quantization - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - ${CONDA_RUN} python -m extension.llm.export.export_llm \ - base.model_class="llama3_2" \ - base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ - base.params="${DOWNLOADED_PATH}/params.json" \ - model.use_kv_cache=true \ - model.use_sdpa_with_kv_cache=true \ - backend.xnnpack.enabled=true \ - model.dtype_override=bf16 \ - base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \ - export.output_name="${OUT_ET_MODEL_NAME}.pte" - ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then - DOWNLOADED_PATH=$(bash
.ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - ${CONDA_RUN} python -m extension.llm.export.export_llm \ - base.model_class=llama3_2 \ - base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ - base.params="${DOWNLOADED_PATH}/params.json" \ - model.use_kv_cache=true \ - model.use_sdpa_with_kv_cache=true \ - model.dtype_override=fp32 \ - backend.xnnpack.enabled=true \ - backend.xnnpack.extended_ops=true \ - quantization.qmode=8da4w \ - quantization.group_size=32 \ - quantization.embedding_quantize=\'8,0\' \ - base.metadata='"{\"get_bos_id\":128000,\"get_eos_ids\":[128009,128001]}"' \ - export.output_name="${OUT_ET_MODEL_NAME}.pte" - ls -lh "${OUT_ET_MODEL_NAME}.pte" - elif [[ ${{ matrix.config }} == "llama3_coreml_ane" ]]; then - # ANE - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") - ${CONDA_RUN} python -m extension.llm.export.export_llm \ - base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \ - base.params="${DOWNLOADED_PATH}/params.json" \ - quantization.embedding_quantize=\'4,32\' \ - model.use_kv_cache=true \ - model.enable_dynamic_shape=false \ - backend.coreml.enabled=true \ - backend.coreml.ios=18 \ - backend.coreml.quantize=c4w \ - backend.coreml.compute_units=cpu_and_ne \ - export.output_name="${OUT_ET_MODEL_NAME}.pte" - ls -lh "${OUT_ET_MODEL_NAME}.pte" - fi - elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then - OUT_ET_MODEL_NAME=${OUT_ET_MODEL_NAME}_llm - if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then - DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json") - ${CONDA_RUN} python -m extension.llm.export.export_llm \ - base.model_class=qwen3_0_6b \ - base.params=examples/models/qwen3/config/0_6b_config.json \ - model.use_kv_cache=true \ - model.use_sdpa_with_kv_cache=true \ - model.dtype_override=fp32 \ - backend.xnnpack.enabled=true \ - backend.xnnpack.extended_ops=true \ - quantization.qmode=8da4w \ - quantization.group_size=32 \ - quantization.embedding_quantize=\'8,0\' \ - base.metadata='"{\"get_bos_id\":151644,\"get_eos_ids\":[151645]}"' \ - export.output_name="${OUT_ET_MODEL_NAME}.pte" - ls -lh "${OUT_ET_MODEL_NAME}.pte" - fi - fi - - if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then - DOWNLOADED_PATH=$( - bash .ci/scripts/download_hf_hub.sh \ - --model_id "${HF_MODEL_REPO}" \ - --files "tokenizer.json" - ) - echo "tokenizer.json is downloaded to $DOWNLOADED_PATH" - - # Install optimum-executorch - OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - git clone https://github.com/huggingface/optimum-executorch - pushd optimum-executorch - # There is no release yet; for CI stability, always test from the same commit on main - git checkout $OPTIMUM_ET_COMMIT - ${CONDA_RUN} python install_dev.py --skip_override_torch - pip list - - ARGS=( - "--model" "${HF_MODEL_REPO}" - "--task" "text-generation" - "--recipe" "xnnpack" - "--use_custom_sdpa" - "--use_custom_kv_cache" - "--qlinear" "8da4w" - "--qembedding" "8w" - "--output_dir" ".."
- ) - - ${CONDA_RUN} optimum-cli export executorch "${ARGS[@]}" - popd - - # The benchmark app relies on the _llm suffix to determine whether the model is an LLM or not - OUT_ET_MODEL_NAME=${OUT_ET_MODEL_NAME}_llm - mv model.pte ${OUT_ET_MODEL_NAME}.pte - ls -lh "${OUT_ET_MODEL_NAME}.pte" - fi - - zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.* - ls -lh model.zip - mkdir -p "${ARTIFACTS_DIR_NAME}" - mv model.zip "${ARTIFACTS_DIR_NAME}" - elif [[ ${{ matrix.model }} == "llama" ]]; then - # Install requirements for export_llama - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - bash examples/models/llama/install_requirements.sh - - # Test llama2 - if [[ ${{ matrix.config }} == *"xnnpack"* ]]; then - DELEGATE_CONFIG="xnnpack+custom+qe" - elif [[ ${{ matrix.config }} == *"coreml"* ]]; then - DELEGATE_CONFIG="coreml" - elif [[ ${{ matrix.config }} == *"mps"* ]]; then - DELEGATE_CONFIG="mps" - fi - DTYPE="fp32" - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - bash .ci/scripts/test_llama.sh \ - -model "stories110M" \ - -build_tool "${BUILD_MODE}" \ - -dtype "${DTYPE}" \ - -mode "${DELEGATE_CONFIG}" \ - -upload "${ARTIFACTS_DIR_NAME}" - else - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - bash .ci/scripts/test_model.sh \ - "${{ matrix.model }}" \ - "${BUILD_MODE}" \ - "${{ matrix.config }}" \ - "${ARTIFACTS_DIR_NAME}" - fi - echo "::endgroup::" - - build-benchmark-app: - name: build-benchmark-app - uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - needs: - - set-parameters - secrets: inherit - with: - runner: macos-14-xlarge - python-version: '3.11' - submodules: 'recursive' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - upload-artifact: ios-apps - secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD - timeout: 90 - script: | - set -eux - - echo "::group::Setting up CI environment" - .ci/scripts/setup-conda.sh - - BUILD_TOOL=cmake - # Setup MacOS dependencies as there is no Docker support on MacOS atm - GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" - export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded - - # Setup Apple certificate for iOS development - BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64}" \ - BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \ - KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \ - .ci/scripts/setup-ios.sh - - # Install CoreML Backend Requirements - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - backends/apple/coreml/scripts/install_requirements.sh - echo "::endgroup::" - - echo "::group::Build ExecuTorch iOS frameworks" - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output scripts/build_apple_frameworks.sh - echo "::endgroup::" - - # NB: Although exported models can be copied to this directory and bundled together with the - # app, we don't use this in CI and rely on AWS extra data parameter to make the model and the - # tokenizer available to the benchmark. This decouples the app and the model.
We just need to - # create the directory here to pass the build - mkdir -p extension/benchmark/apple/Benchmark/Models - ${CONDA_RUN} --no-capture-output \ - scripts/build_apple_llm_demo.sh ${ARTIFACTS_DIR_NAME} - - upload-benchmark-app: - needs: build-benchmark-app - runs-on: linux.2xlarge - steps: - - name: Download the apps from GitHub - uses: actions/download-artifact@v4 - with: - # The name here needs to match the name of the upload-artifact parameter - name: ios-apps - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the apps - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ - - - name: Upload the apps to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifacts - retention-days: 14 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ - - # CHANGE IF this job name 'benchmark-on-device' changed: extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py - benchmark-on-device: - if: always() - needs: - - set-parameters - - prepare-test-specs - - upload-benchmark-app - - export-models - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main - strategy: - matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }} - fail-fast: false - with: - # Due to scheduling a job may be pushed beyond the default 60m threshold - timeout: 120 - device-type: ios - # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS - runner: linux.2xlarge - test-infra-ref: '' - # This is the ARN of ExecuTorch project on AWS - project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 - device-pool-arn: ${{ matrix.device_arn }} - # Uploaded to S3 from the previous job - ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa - ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.xctestrun.zip - test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/default-ios-device-farm-appium-test-spec.yml - new-output-format-flag: true - - upload-benchmark-results: - needs: - - benchmark-on-device - if: always() - runs-on: linux.2xlarge - environment: upload-benchmark-results - permissions: - id-token: write - contents: read - steps: - - uses: actions/checkout@v3 - with: - submodules: false - - - name: Authenticate with AWS - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results - # The max duration enforced by the server side - role-duration-seconds: 18000 - aws-region: us-east-1 - - - name: Setup conda - uses: pytorch/test-infra/.github/actions/setup-miniconda@main - with: - python-version: '3.10' - - - name: Download the list of artifacts from S3 - env: - ARTIFACTS_S3_DIR: s3://gha-artifacts/device_farm/${{ github.run_id }}/${{ github.run_attempt }}/artifacts/ - shell: bash - run: | - set -eux - ${CONDA_RUN} python -mpip install awscli==1.32.18 - - mkdir -p artifacts - pushd artifacts - ${CONDA_RUN} aws s3 sync "${ARTIFACTS_S3_DIR}" . 
- popd - - ls -lah artifacts - - - name: Download the list of benchmark configs from S3 - env: - BENCHMARK_CONFIGS_DIR: s3://gha-artifacts/${{ github.repository }}/${{ github.run_id }}/artifacts/benchmark-configs/ - shell: bash - run: | - set -eux - mkdir -p benchmark-configs - pushd benchmark-configs - ${CONDA_RUN} aws s3 sync "${BENCHMARK_CONFIGS_DIR}" . - popd - ls -lah benchmark-configs - - - name: Extract the benchmark results JSON - shell: bash - env: - DEVICE_TYPE: ios - run: | - set -eux - - mkdir -p benchmark-results - - for ARTIFACTS_BY_JOB in artifacts/*.json; do - [ -f "${ARTIFACTS_BY_JOB}" ] || break - echo "${ARTIFACTS_BY_JOB}" - ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \ - --artifacts "${ARTIFACTS_BY_JOB}" \ - --output-dir benchmark-results \ - --app "${DEVICE_TYPE}" \ - --benchmark-configs benchmark-configs - done - - for BENCHMARK_RESULTS in benchmark-results/v3/*.json; do - cat "${BENCHMARK_RESULTS}" - echo - done - - - name: Upload the benchmark results (v3) - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main - with: - benchmark-results-dir: benchmark-results/v3 - dry-run: false - schema-version: v3 - github-token: ${{ secrets.GITHUB_TOKEN }}