7070 # Keep the default values separate from the workflow dispatch inputs so that the
7171 # defaults remain accessible during scheduled runs and can differ between
7272 # on-demand and periodic benchmarking.
73- CRON_DEFAULT_MODELS : ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
73+ CRON_DEFAULT_MODELS : ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'llama' }}
7474 CRON_DEFAULT_DEVICES : samsung_galaxy_s22
7575 run : |
7676 set -eux
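# Illustrative note (not part of the workflow): GitHub's `${{ cond && A || B }}`
# expression acts as a ternary, so CRON_DEFAULT_MODELS above expands to the full
# model list only for `schedule` events and falls back to 'llama' on every other
# trigger, e.g. a manual run:
#
#   github.event_name == 'schedule'          -> 'llama,mv3,mv2,...,allenai/OLMo-1B-hf'
#   github.event_name == 'workflow_dispatch' -> 'llama'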
@@ -201,8 +201,8 @@ jobs:
201201 HF_MODEL_REPO=${{ matrix.model }}
202202 OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
203203
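# Worked example (illustrative): for HF_MODEL_REPO=meta-llama/Llama-3.2-1B with
# config=et_xnnpack_custom_spda_kv_cache_8da4w, the pipeline above strips the org
# prefix, replaces underscores with dashes, and lowercases, producing:
#
#   OUT_ET_MODEL_NAME=llama-3.2-1b_et_xnnpack_custom_spda_kv_cache_8da4w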
204+ # Convert the Hugging Face checkpoint to an ExecuTorch .pte via the etLLM export path
204205 if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
205- # Llama models on Hugging Face
206206 if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
207207 # SpinQuant
208208 # Download the prequantized checkpoint from Hugging Face
@@ -272,6 +272,21 @@ jobs:
272272 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
273273 --output_name="${OUT_ET_MODEL_NAME}.pte"
274274 ls -lh "${OUT_ET_MODEL_NAME}.pte"
275+ elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
276+ DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
277+ python -m examples.models.llama.export_llama \
278+ --model llama3_2 \
279+ --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
280+ --params "${DOWNLOADED_PATH}/params.json" \
281+ -kv \
282+ --use_sdpa_with_kv_cache \
283+ -d fp32 \
284+ -X \
285+ --xnnpack-extended-ops \
286+ -qmode 8da4w -G 32 -E 8,0 \
287+ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
288+ --output_name="${OUT_ET_MODEL_NAME}.pte"
289+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
275290 elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
276291 export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
277292 export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
@@ -292,21 +307,75 @@ jobs:
292307 OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm's script hard-codes this output name
293308 find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
294309 ls -lh "${OUT_ET_MODEL_NAME}.pte"
295- else
296- # By default, test with the Hugging Face model and the xnnpack recipe
297- DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
298- python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
299- ls -lh "${OUT_ET_MODEL_NAME}.pte"
300310 fi
301- else
302- echo "Unsupported model ${{ matrix.model }}"
303- exit 1
311+ elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
312+ if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
313+ DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
314+ python -m examples.models.llama.export_llama \
315+ --model qwen3-0_6b \
316+ --params examples/models/qwen3/0_6b_config.json \
317+ -kv \
318+ --use_sdpa_with_kv_cache \
319+ -d fp32 \
320+ -X \
321+ --xnnpack-extended-ops \
322+ -qmode 8da4w \
323+ -G 32 \
324+ -E 8,0 \
325+ --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
326+ --output_name="${OUT_ET_MODEL_NAME}.pte"
327+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
328+ fi
329+ fi
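# A minimal sketch of the shared etLLM export shape used by both branches above,
# assuming a model whose params config ships under examples/models/ (the model
# name and params path below are placeholders, not real workflow values):
#
#   python -m examples.models.llama.export_llama \
#     --model <etllm-model-name> \
#     --params <path/to/params.json> \
#     -kv --use_sdpa_with_kv_cache \
#     -d fp32 -X --xnnpack-extended-ops \
#     -qmode 8da4w -G 32 -E 8,0 \
#     --output_name="${OUT_ET_MODEL_NAME}.pte"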
330+
331+ if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
332+ DOWNLOADED_PATH=$(
333+ bash .ci/scripts/download_hf_hub.sh \
334+ --model_id "${HF_MODEL_REPO}" \
335+ --files "tokenizer.json"
336+ )
337+ echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
338+
339+ # Install optimum-executorch
340+ git clone https://github.com/huggingface/optimum-executorch
341+ pushd optimum-executorch
342+ # No release yet; for CI stability, always test against the same pinned commit on main
343+ git checkout 1c653dc49812fc431a22312c7295d97005d22e12
344+ python install_dev.py
345+ pip list
346+
347+ ARGS=(
348+ "--model" "${HF_MODEL_REPO}"
349+ "--task" "text-generation"
350+ "--recipe" "xnnpack"
351+ "--use_custom_sdpa"
352+ "--qlinear"
353+ "--qembedding"
354+ "--output_dir" ".."
355+ )
356+
357+ # Add conditional arguments based on model
358+ case "${HF_MODEL_REPO}" in
359+ *"google/gemma-3-1b-it"*)
360+ echo "--use_custom_kv_cache can not be used for HybridCache"
361+ ;;
362+ *)
363+ ARGS+=("--use_custom_kv_cache")
364+ ;;
365+ esac
366+
367+ optimum-cli export executorch "${ARGS[@]}"
368+ popd
369+
370+ mv model.pte "${OUT_ET_MODEL_NAME}.pte"
371+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
304372 fi
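# Local reproduction sketch (assumptions: the same pinned commit works outside CI,
# and the chosen model does not use HybridCache so --use_custom_kv_cache applies;
# the model id and output dir are illustrative):
#
#   git clone https://github.com/huggingface/optimum-executorch
#   cd optimum-executorch
#   git checkout 1c653dc49812fc431a22312c7295d97005d22e12
#   python install_dev.py
#   optimum-cli export executorch \
#     --model HuggingFaceTB/SmolLM2-135M --task text-generation --recipe xnnpack \
#     --use_custom_sdpa --use_custom_kv_cache --qlinear --qembedding \
#     --output_dir ./out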
305373
306- zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
374+ zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}"/tokenizer.*
307375 ls -lh model.zip
308- mkdir -p "${ARTIFACTS_DIR_NAME}"
309- mv model.zip "${ARTIFACTS_DIR_NAME}"
376+ mkdir -p "${ARTIFACTS_DIR_NAME}"
377+ mv model.zip "${ARTIFACTS_DIR_NAME}"
378+ ls -lh "${ARTIFACTS_DIR_NAME}"
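# Optional sanity check (illustrative): confirm the archive bundles both the .pte
# and the tokenizer before it is handed off as an artifact:
#
#   unzip -l "${ARTIFACTS_DIR_NAME}/model.zip"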
310379 elif [[ ${{ matrix.model }} == "llama" ]]; then
311380 # Install requirements for export_llama
312381 PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh