From ef3b9ed5719ede8cdd6c211e3f629373634ebeb2 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 13 Aug 2025 17:34:43 -0700 Subject: [PATCH 1/7] Improve optimum coverage in ET (more models, xnnpack on mac) --- .ci/scripts/test_huggingface_optimum_model.py | 119 +++++++++++++++--- .github/workflows/trunk.yml | 110 +++++++--------- 2 files changed, 150 insertions(+), 79 deletions(-) diff --git a/.ci/scripts/test_huggingface_optimum_model.py b/.ci/scripts/test_huggingface_optimum_model.py index 6a31eabb0c8..589d49ef475 100644 --- a/.ci/scripts/test_huggingface_optimum_model.py +++ b/.ci/scripts/test_huggingface_optimum_model.py @@ -1,7 +1,11 @@ import argparse +import gc +import logging +import math import subprocess import tempfile from pathlib import Path +from typing import List import torch from datasets import load_dataset @@ -15,6 +19,7 @@ ) from transformers import ( AutoConfig, + AutoModelForCausalLM, AutoModelForImageClassification, AutoProcessor, AutoTokenizer, @@ -37,6 +42,56 @@ def cli_export(command, model_dir): print(f"Export failed with error: {e}") +def check_causal_lm_output_quality( + model_id: str, generated_tokens: List[int], max_perplexity_threshold: float = 100.0 +): + """ + Evaluates the quality of text generated by a causal language model by calculating its perplexity. + + Args: + model_id: HuggingFace model identifier (e.g., "google/gemma2-2b") + generated_tokens: The tokens generated by the exported model to evaluate + max_perplexity_threshold: Maximum acceptable perplexity (lower is better) + + Returns: + tuple: (is_quality_ok, reason) with boolean result and explanation + """ + logging.info(f"Starting perplexity check with model '{model_id}' ...") + # Load model + model = AutoModelForCausalLM.from_pretrained( + model_id, + low_cpu_mem_usage=True, + use_cache=False, + torch_dtype=torch.bfloat16, + ) + + with torch.no_grad(): + outputs = model(input_ids=generated_tokens, labels=generated_tokens) + + # Get the loss (negative log-likelihood) + loss = outputs.loss.item() + + # Calculate perplexity (exp of the average negative log-likelihood) + perplexity = math.exp(loss) + + is_quality_ok = perplexity <= max_perplexity_threshold + if is_quality_ok: + logging.info( + f"✓ Perplexity check passed: {perplexity:.2f} <= {max_perplexity_threshold}" + ) + else: + logging.warning( + f"✗ Perplexity check failed: {perplexity:.2f} > {max_perplexity_threshold}" + ) + + # Clean up immediately + del model + del outputs + gc.collect() + + return is_quality_ok + + def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only=False): command = [ "optimum-cli", @@ -51,7 +106,15 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only "--output_dir", model_dir, ] - if "coreml" in recipe: + if "xnnpack" in recipe: + if quantize: + command += [ + "--qlinear", + "8da4w", + "--qembedding", + "8w", + ] + elif "coreml" in recipe: command += [ "--disable_dynamic_shapes", ] @@ -63,7 +126,9 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only "8w", ] else: - assert not quantize, "Quantization is not supported for non-CoreML recipes yet" + assert ( + not quantize + ), "Quantization is only supported for XnnPack and CoreML recipes at the moment." 
if not run_only: cli_export(command, model_dir) @@ -77,6 +142,14 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only max_seq_len=64, ) print(f"\nGenerated text:\n\t{generated_text}") + generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids + + # Free memory before loading eager for quality check + del model + del tokenizer + gc.collect() + + assert check_causal_lm_output_quality(model_id, generated_tokens) is True def test_fill_mask(model_id, model_dir, recipe, *, quantize=True, run_only=False): @@ -278,23 +351,39 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): ) args = parser.parse_args() - model_to_model_id_and_test_function = { - "smollm": ("HuggingFaceTB/SmolLM2-135M", test_text_generation), # works - "qwen3": ("Qwen/Qwen3-0.6B", test_text_generation), # works - "olmo": ("allenai/OLMo-1B-hf", test_text_generation), # works - "gemma3": ("unsloth/gemma-3-1b-it", test_text_generation), # does not export - "phi4": ( + _text_generation_mapping = { + "llama3.2-1b": ("NousResearch/Llama-3.2-1B", test_text_generation), + "qwen3-0.6b": ("Qwen/Qwen3-0.6B", test_text_generation), + "qwen3-1.7b": ("Qwen/Qwen3-1.7B", test_text_generation), + "gemma3-1b": ( + "unsloth/gemma-3-1b-it", + test_text_generation, + ), # does not export for CoreML + "phi4-mini": ( "microsoft/Phi-4-mini-instruct", test_text_generation, - ), # fails to lower - "llama3": ("NousResearch/Llama-3.2-1B", test_text_generation), # works - "bert": ("google-bert/bert-base-uncased", test_fill_mask), # works - "roberta": ("FacebookAI/xlmcl-roberta-base", test_fill_mask), # works - "distilbert": ("distilbert/distilbert-base-uncased", test_fill_mask), # works - "whisper": ("openai/whisper-tiny", test_whisper), # works + ), # fails to lower for CoreML + "smollm2-135m": ("HuggingFaceTB/SmolLM2-135M", test_text_generation), + "smollm3-3b": ("HuggingFaceTB/SmolLM3-3B", test_text_generation), + "olmo": ("allenai/OLMo-1B-hf", test_text_generation), + } + + _mask_fill_mapping = { + "bert": ("google-bert/bert-base-uncased", test_fill_mask), + "roberta": ("FacebookAI/xlmcl-roberta-base", test_fill_mask), + "distilbert": ("distilbert/distilbert-base-uncased", test_fill_mask), + } + + _misc_model_mapping = { + "whisper": ("openai/whisper-tiny", test_whisper), "t5": ("google-t5/t5-small", test_t5), # CoreML runime failure - "vit": ("google/vit-base-patch16-224", test_vit), # works + "vit": ("google/vit-base-patch16-224", test_vit), } + + model_to_model_id_and_test_function = ( + _text_generation_mapping | _mask_fill_mapping | _misc_model_mapping + ) + if args.model not in model_to_model_id_and_test_function: raise ValueError( f"Unknown model name: {args.model}. 
Available models: {model_to_model_id_and_test_function.keys()}" diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 34a955b88a9..12225c971c6 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -732,24 +732,29 @@ jobs: echo "::endgroup::" done - test-huggingface-transformers: + test-huggingface-transformers-xnnpack: # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway if: ${{ !github.event.pull_request.head.repo.fork }} - name: test-huggingface-transformers + name: test-huggingface-transformers-xnnpack uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write contents: read secrets: inherit + # Models below selected based on https://huggingface.co/models?pipeline_tag=text-generation&num_parameters=min:0,max:3B&sort=trending. + xnnpack: &xnnpack + - llama3.2-1b|xnnpack|--quantize + - qwen3-0.6b|xnnpack|--quantize + - qwen3-1.7b|xnnpack|--quantize + - gemma3-1b|xnnpack|--quantize + - phi-4-mini|xnnpack|--quantize + - smollm2-135m|xnnpack|--quantize + - smollm3-3b|xnnpack|--quantize strategy: matrix: - hf_model_id: [ - google/gemma-3-1b-it, - Qwen/Qwen3-0.6B, - HuggingFaceTB/SmolLM2-135M, - meta-llama/Llama-3.2-1B, - allenai/OLMo-1B-hf, - ] + config: + - *xnnpack + - *coreml fail-fast: false with: secrets-env: EXECUTORCH_HF_TOKEN @@ -760,6 +765,12 @@ jobs: timeout: 90 upload-artifact: profiling-artifacts-${{ strategy.job-index }} script: | + set -eux + IFS='|' read -r MODEL RECIPE QUANTIZE <<< "${{ matrix.config }}" + echo "Model: $MODEL" + echo "Recipe: $RECIPE" + echo "Quantize: $QUANTIZE" + echo "::group::Set up ExecuTorch" # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") @@ -797,51 +808,11 @@ jobs: pip list echo "::endgroup::" - echo "::group::Export to ExecuTorch" - # Pass matrix variable as environment variable - export MODEL_ID="${{ matrix.hf_model_id }}" - export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_kv_cache_8da4w" - pushd optimum-executorch - - ARGS=( - "--model" "${MODEL_ID}" - "--task" "text-generation" - "--recipe" "xnnpack" - "--use_custom_sdpa" - "--use_custom_kv_cache" - "--qlinear" "8da4w" - "--qembedding" "8w" - "--output_dir" "${OUTPUT_DIR}" - ) - - optimum-cli export executorch "${ARGS[@]}" - - ls -FlAGhp ${OUTPUT_DIR} - popd - echo "::endgroup::" - - echo "::group::Inference using python API" - pushd optimum-executorch - python -c " - import os - from optimum.executorch import ExecuTorchModelForCausalLM - from transformers import AutoTokenizer - - model_id = os.getenv('MODEL_ID') - pte_dir = os.getenv('OUTPUT_DIR') - print(f'Loading model {model_id} from {pte_dir}.') - model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir) - generated_text = model.text_generation( - tokenizer=AutoTokenizer.from_pretrained(model_id), - prompt='Simply put, the theory of relativity states that', - max_seq_len=64 - ) - print(generated_text) - " - popd + echo "::group::Run tests" + python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} echo "::endgroup::" - echo "::group::Inference using executor_runner with ETDump" + echo "::group::Generate artifacts for performance profiling" ./cmake-out/executor_runner \ --model_path ${OUTPUT_DIR}/model.pte \ --etdump_path ${OUTPUT_DIR}/etdump.etdp @@ -851,29 +822,40 @@ jobs: python3 -m devtools.inspector.inspector_cli \ --etdump_path ${OUTPUT_DIR}/etdump.etdp \ --tsv_path 
${TSV_PATH} - echo "::endgroup::" - test-huggingface-optimum-coreml: + test-huggingface-transformers-coreml: # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway if: ${{ !github.event.pull_request.head.repo.fork }} - name: test-huggingface-optimum-coreml + name: test-huggingface-transformers-coreml uses: pytorch/test-infra/.github/workflows/macos_job.yml@main permissions: id-token: write contents: read secrets: inherit + # Models below selected based on https://huggingface.co/models?pipeline_tag=text-generation&num_parameters=min:0,max:3B&sort=trending. + xnnpack: &xnnpack + - llama3.2-1b|xnnpack|--quantize + - qwen3-0.6b|xnnpack|--quantize + - qwen3-1.7b|xnnpack|--quantize + - gemma3-1b|xnnpack|--quantize + - phi-4-mini|xnnpack|--quantize + - smollm2-135m|xnnpack|--quantize + - smollm3-3b|xnnpack|--quantize + coreml: &coreml + - llama3.2-1b|coreml_fp32_gpu|--quantize, + - qwen3-0.6b|coreml_fp32_gpu|--quantize, + - qwen3-0.6b|coreml_fp32_gpu|--quantize, + - smollm2-135m|coreml_fp32_gpu|--quantize, + - olmo-1b|coreml_fp32_gpu|--quantize, + # - roberta|coreml_fp32_gpu|--quantize, roberta requires special HF access + - bert|coreml_fp32_gpu|--quantize, + - distilbert|coreml_fp32_gpu|--quantize, strategy: matrix: - config: [ - qwen3|coreml_fp32_gpu|--quantize, - smollm|coreml_fp32_gpu|--quantize, - llama3|coreml_fp32_gpu|--quantize, - olmo|coreml_fp32_gpu|--quantize, - # roberta|coreml_fp32_gpu|--quantize, roberta requires special HF access - bert|coreml_fp32_gpu|--quantize, - distilbert|coreml_fp32_gpu|--quantize, - ] + config: + - *xnnpack + - *coreml fail-fast: false with: secrets-env: EXECUTORCH_HF_TOKEN From 79b2841f94133df95b208a363702771030ae13eb Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 13 Aug 2025 21:04:53 -0700 Subject: [PATCH 2/7] Fix trunk.yml formatting --- .github/workflows/trunk.yml | 60 +++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 12225c971c6..8ed07d44378 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -741,20 +741,18 @@ jobs: id-token: write contents: read secrets: inherit - # Models below selected based on https://huggingface.co/models?pipeline_tag=text-generation&num_parameters=min:0,max:3B&sort=trending. - xnnpack: &xnnpack - - llama3.2-1b|xnnpack|--quantize - - qwen3-0.6b|xnnpack|--quantize - - qwen3-1.7b|xnnpack|--quantize - - gemma3-1b|xnnpack|--quantize - - phi-4-mini|xnnpack|--quantize - - smollm2-135m|xnnpack|--quantize - - smollm3-3b|xnnpack|--quantize strategy: matrix: - config: - - *xnnpack - - *coreml + config: [ + # XNNPack. + llama3.2-1b|xnnpack|--quantize, + qwen3-0.6b|xnnpack|--quantize, + qwen3-1.7b|xnnpack|--quantize, + gemma3-1b|xnnpack|--quantize, + phi-4-mini|xnnpack|--quantize, + smollm2-135m|xnnpack|--quantize, + smollm3-3b|xnnpack|--quantize + ] fail-fast: false with: secrets-env: EXECUTORCH_HF_TOKEN @@ -834,28 +832,26 @@ jobs: contents: read secrets: inherit # Models below selected based on https://huggingface.co/models?pipeline_tag=text-generation&num_parameters=min:0,max:3B&sort=trending. 
- xnnpack: &xnnpack - - llama3.2-1b|xnnpack|--quantize - - qwen3-0.6b|xnnpack|--quantize - - qwen3-1.7b|xnnpack|--quantize - - gemma3-1b|xnnpack|--quantize - - phi-4-mini|xnnpack|--quantize - - smollm2-135m|xnnpack|--quantize - - smollm3-3b|xnnpack|--quantize - coreml: &coreml - - llama3.2-1b|coreml_fp32_gpu|--quantize, - - qwen3-0.6b|coreml_fp32_gpu|--quantize, - - qwen3-0.6b|coreml_fp32_gpu|--quantize, - - smollm2-135m|coreml_fp32_gpu|--quantize, - - olmo-1b|coreml_fp32_gpu|--quantize, - # - roberta|coreml_fp32_gpu|--quantize, roberta requires special HF access - - bert|coreml_fp32_gpu|--quantize, - - distilbert|coreml_fp32_gpu|--quantize, strategy: matrix: - config: - - *xnnpack - - *coreml + config: [ + # XNNPack. + llama3.2-1b|xnnpack|--quantize, + qwen3-0.6b|xnnpack|--quantize, + qwen3-1.7b|xnnpack|--quantize, + gemma3-1b|xnnpack|--quantize, + phi-4-mini|xnnpack|--quantize, + smollm2-135m|xnnpack|--quantize, + smollm3-3b|xnnpack|--quantize, + # CoreML. + llama3.2-1b|coreml_fp32_gpu|--quantize, + qwen3-0.6b|coreml_fp32_gpu|--quantize, + qwen3-1.7b|xnnpack|--quantize, + smollm2-135m|coreml_fp32_gpu|--quantize, + olmo-1b|coreml_fp32_gpu|--quantize, + bert|coreml_fp32_gpu|--quantize, + distilbert|coreml_fp32_gpu|--quantize + ] fail-fast: false with: secrets-env: EXECUTORCH_HF_TOKEN From d993828939fd98c3bd6ebd0664ccd4aa812a81ec Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 09:51:36 -0700 Subject: [PATCH 3/7] Fix trunk.yml --- .github/workflows/trunk.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 8ed07d44378..f528e3845e5 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -807,7 +807,8 @@ jobs: echo "::endgroup::" echo "::group::Run tests" - python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} + export OUTPUT_DIR="$(pwd)/${MODEL_ID}_${RECIPE}_${QUANTIZE}" + python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} --model_dir ${OUTPUT_DIR} echo "::endgroup::" echo "::group::Generate artifacts for performance profiling" From 9eefe7fffe4ee6d0575829e4eb870eff442f7144 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 09:52:50 -0700 Subject: [PATCH 4/7] Try pinning llvm-openmp --- .ci/docker/conda-env-ci.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/conda-env-ci.txt b/.ci/docker/conda-env-ci.txt index 995f9635a0d..bcde7fd3fb8 100644 --- a/.ci/docker/conda-env-ci.txt +++ b/.ci/docker/conda-env-ci.txt @@ -1,5 +1,5 @@ cmake=3.31.2 ninja=1.10.2 libuv -llvm-openmp +llvm-openmp=19.1.7 pkg-config From 36e718d544bad7536d679220d2c4afc0a50157ac Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:29:06 -0700 Subject: [PATCH 5/7] Revert "Try pinning llvm-openmp" This reverts commit 9eefe7fffe4ee6d0575829e4eb870eff442f7144. 
--- .ci/docker/conda-env-ci.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/conda-env-ci.txt b/.ci/docker/conda-env-ci.txt index bcde7fd3fb8..995f9635a0d 100644 --- a/.ci/docker/conda-env-ci.txt +++ b/.ci/docker/conda-env-ci.txt @@ -1,5 +1,5 @@ cmake=3.31.2 ninja=1.10.2 libuv -llvm-openmp=19.1.7 +llvm-openmp pkg-config From cd3b4fb34957007c028cbd95a7aa9e1bd466c20f Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 15 Aug 2025 08:48:05 -0700 Subject: [PATCH 6/7] Fix trunk --- .github/workflows/trunk.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index f528e3845e5..ee17524acce 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -749,7 +749,7 @@ jobs: qwen3-0.6b|xnnpack|--quantize, qwen3-1.7b|xnnpack|--quantize, gemma3-1b|xnnpack|--quantize, - phi-4-mini|xnnpack|--quantize, + phi4-mini|xnnpack|--quantize, smollm2-135m|xnnpack|--quantize, smollm3-3b|xnnpack|--quantize ] @@ -807,7 +807,7 @@ jobs: echo "::endgroup::" echo "::group::Run tests" - export OUTPUT_DIR="$(pwd)/${MODEL_ID}_${RECIPE}_${QUANTIZE}" + export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}" python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} --model_dir ${OUTPUT_DIR} echo "::endgroup::" @@ -816,7 +816,7 @@ jobs: --model_path ${OUTPUT_DIR}/model.pte \ --etdump_path ${OUTPUT_DIR}/etdump.etdp - export TSV_PATH=artifacts-to-be-uploaded/${MODEL_ID}_op_prof.tsv + export TSV_PATH=artifacts-to-be-uploaded/${MODEL}_op_prof.tsv mkdir -p $(dirname "$TSV_PATH") python3 -m devtools.inspector.inspector_cli \ --etdump_path ${OUTPUT_DIR}/etdump.etdp \ @@ -841,7 +841,7 @@ jobs: qwen3-0.6b|xnnpack|--quantize, qwen3-1.7b|xnnpack|--quantize, gemma3-1b|xnnpack|--quantize, - phi-4-mini|xnnpack|--quantize, + phi4-mini|xnnpack|--quantize, smollm2-135m|xnnpack|--quantize, smollm3-3b|xnnpack|--quantize, # CoreML. From a0c373fdc77737495ffa1f5ca3f805e70d335fc3 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 15 Aug 2025 09:16:08 -0700 Subject: [PATCH 7/7] Use custom kv cache and sdpa for xnnpack --- .ci/scripts/test_huggingface_optimum_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.ci/scripts/test_huggingface_optimum_model.py b/.ci/scripts/test_huggingface_optimum_model.py index 589d49ef475..cd7a7c2124e 100644 --- a/.ci/scripts/test_huggingface_optimum_model.py +++ b/.ci/scripts/test_huggingface_optimum_model.py @@ -107,6 +107,10 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only model_dir, ] if "xnnpack" in recipe: + command += [ + "--use_custom_sdpa", + "--use_custom_kv_cache", + ] if quantize: command += [ "--qlinear",
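
Taken together, these patches change how the XNNPACK text-generation path is exported. Below is a minimal sketch of the command list that the patched test_text_generation() appears to hand to cli_export() for a quantized XNNPACK run; the model id and output directory are illustrative placeholders, and any flags outside the visible hunks are omitted rather than guessed at.

    import subprocess

    # Sketch (not part of the patch) of the optimum-cli invocation assembled by
    # the patched test_text_generation() for a quantized XNNPACK export.
    command = [
        "optimum-cli", "export", "executorch",
        "--model", "Qwen/Qwen3-0.6B",                     # illustrative model id
        "--task", "text-generation",
        "--recipe", "xnnpack",
        "--output_dir", "./qwen3-0.6b_xnnpack_quantized",  # illustrative path
        # Added by PATCH 7/7: custom SDPA and static KV-cache kernels.
        "--use_custom_sdpa",
        "--use_custom_kv_cache",
        # Added by PATCH 1/7 when quantize=True: 8-bit-dynamic-activation /
        # 4-bit-weight linears and 8-bit-weight embeddings.
        "--qlinear", "8da4w",
        "--qembedding", "8w",
    ]
    subprocess.run(command, check=True)  # assumes optimum-executorch is installed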
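Once exported, the resulting .pte is exercised through the optimum-executorch Python API; this is what both the retired inline workflow step and test_text_generation() do. A small sketch, assuming the illustrative artifact directory from above and the prompt used in the workflow:

    from transformers import AutoTokenizer
    from optimum.executorch import ExecuTorchModelForCausalLM

    model_id = "Qwen/Qwen3-0.6B"                 # illustrative model id
    pte_dir = "./qwen3-0.6b_xnnpack_quantized"   # directory holding model.pte

    # Load the exported program and run generation against the eager tokenizer.
    model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir)
    generated_text = model.text_generation(
        tokenizer=AutoTokenizer.from_pretrained(model_id),
        prompt="Simply put, the theory of relativity states that",
        max_seq_len=64,
    )
    print(generated_text)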
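PATCH 1/7 gates output quality on perplexity rather than on inspecting the text by hand: the generated tokens are re-scored by the eager Hugging Face model, and the exponential of the mean negative log-likelihood must stay at or below a threshold (100.0 by default). A condensed sketch of that check, with an illustrative model id and generated text:

    import gc
    import math

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "Qwen/Qwen3-0.6B"                                  # illustrative
    generated_text = "Simply put, the theory of relativity ..."   # output from the .pte

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    token_ids = tokenizer(generated_text, return_tensors="pt").input_ids

    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, use_cache=False
    )
    with torch.no_grad():
        # Scoring the tokens against themselves yields the mean negative log-likelihood.
        loss = model(input_ids=token_ids, labels=token_ids).loss.item()

    perplexity = math.exp(loss)
    assert perplexity <= 100.0, f"perplexity {perplexity:.2f} exceeds threshold"

    # Free the eager model immediately, as the CI script does, to keep memory bounded.
    del model
    gc.collect()

A threshold on perplexity is presumably chosen because exact-match against eager output would be too strict for quantized exports, while a loose statistical gate still catches models that export but generate garbage.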