From 93d300e6604ae7344b75957598e091089a269864 Mon Sep 17 00:00:00 2001 From: Scott Roy Date: Thu, 4 Sep 2025 20:08:11 -0700 Subject: [PATCH 01/16] Support pytorch_bin format --- examples/models/phi_4_mini/convert_weights.py | 10 +++++++--- examples/models/qwen3/convert_weights.py | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/models/phi_4_mini/convert_weights.py b/examples/models/phi_4_mini/convert_weights.py index 01b7302ed2e..1fd840c75e3 100644 --- a/examples/models/phi_4_mini/convert_weights.py +++ b/examples/models/phi_4_mini/convert_weights.py @@ -95,9 +95,13 @@ def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.T converted_state_dict = {} inverted_mapping_dict = {v: k for k, v in _PHI_4_FROM_META.items()} - for key, value in state_dict.items(): - new_key = get_mapped_key(key, inverted_mapping_dict) - converted_state_dict[new_key] = value + # Single checkpoint + model_path = os.path.join(input_dir, "pytorch_model.bin") + if os.path.exists(model_path): + state_dict = torch.load( + model_path, weights_only=True, map_location=torch.device("cpu") + ) + return state_dict # Input and output embeddings are tied. converted_state_dict["output.weight"] = converted_state_dict[ diff --git a/examples/models/qwen3/convert_weights.py b/examples/models/qwen3/convert_weights.py index 404fd4cbe88..e905d435ac0 100644 --- a/examples/models/qwen3/convert_weights.py +++ b/examples/models/qwen3/convert_weights.py @@ -89,6 +89,8 @@ def load_checkpoint_from_safetensors(input_dir: str) -> Dict: raise FileNotFoundError(f"Could not find safetensors checkpoint in {input_dir}") + raise FileNotFoundError(f"Could not find pytorch_model checkpoint in {input_dir}") + def load_checkpoint(input_dir: str) -> Dict: try: From 7e2fbd53e315a1ea18d55237c4c2cabc0a1e1b27 Mon Sep 17 00:00:00 2001 From: Scott Roy Date: Thu, 4 Sep 2025 20:10:53 -0700 Subject: [PATCH 02/16] up --- examples/models/phi_4_mini/convert_weights.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/examples/models/phi_4_mini/convert_weights.py b/examples/models/phi_4_mini/convert_weights.py index 1fd840c75e3..51520a9bba5 100644 --- a/examples/models/phi_4_mini/convert_weights.py +++ b/examples/models/phi_4_mini/convert_weights.py @@ -66,6 +66,29 @@ def phi_4_hf_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten return converted_state_dict +def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """ + Convert a state dict from torchtune's format to Meta's format. This function + doesn't handle any sharding or splitting of state dicts. It follows the + state_dict IN -> state_dict OUT pattern. + Args: + state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format. + Returns: + Dict[str, torch.Tensor]: State dict in Meta's format. + """ + converted_state_dict = {} + inverted_mapping_dict = {v: k for k, v in _PHI_4_FROM_META.items()} + + for key, value in state_dict.items(): + new_key = get_mapped_key(key, inverted_mapping_dict) + converted_state_dict[new_key] = value + + # Input and output embeddings are tied. + converted_state_dict["output.weight"] = converted_state_dict[ + "tok_embeddings.weight" + ] + + # Standard _FROM_META weight mapping of Meta weights to TorchTune. 
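+# (For example, get_mapped_key with the inverted form of this mapping is
+# expected to send the torchtune key "layers.0.attn.q_proj.weight" back to
+# the Meta key "layers.0.attention.wq.weight".)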
_PHI_4_FROM_META = { "tok_embeddings.weight": "tok_embeddings.weight", From ab15cf7a311304db1b3f27351d10e98c8a55dbd6 Mon Sep 17 00:00:00 2001 From: Scott Roy Date: Thu, 4 Sep 2025 20:12:47 -0700 Subject: [PATCH 03/16] up --- examples/models/phi_4_mini/convert_weights.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/examples/models/phi_4_mini/convert_weights.py b/examples/models/phi_4_mini/convert_weights.py index 51520a9bba5..1bae99ec03a 100644 --- a/examples/models/phi_4_mini/convert_weights.py +++ b/examples/models/phi_4_mini/convert_weights.py @@ -66,6 +66,22 @@ def phi_4_hf_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten return converted_state_dict +# Standard _FROM_META weight mapping of Meta weights to TorchTune. +_PHI_4_FROM_META = { + "tok_embeddings.weight": "tok_embeddings.weight", + "norm.weight": "norm.scale", + "layers.{}.attention.wk.weight": "layers.{}.attn.k_proj.weight", + "layers.{}.attention.wq.weight": "layers.{}.attn.q_proj.weight", + "layers.{}.attention.wv.weight": "layers.{}.attn.v_proj.weight", + "layers.{}.attention.wo.weight": "layers.{}.attn.output_proj.weight", + "layers.{}.attention_norm.weight": "layers.{}.sa_norm.scale", + "layers.{}.ffn_norm.weight": "layers.{}.mlp_norm.scale", + "layers.{}.feed_forward.w1.weight": "layers.{}.mlp.w1.weight", + "layers.{}.feed_forward.w2.weight": "layers.{}.mlp.w2.weight", + "layers.{}.feed_forward.w3.weight": "layers.{}.mlp.w3.weight", +} + + def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """ Convert a state dict from torchtune's format to Meta's format. This function @@ -87,22 +103,7 @@ def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.T converted_state_dict["output.weight"] = converted_state_dict[ "tok_embeddings.weight" ] - - -# Standard _FROM_META weight mapping of Meta weights to TorchTune. 
-_PHI_4_FROM_META = { - "tok_embeddings.weight": "tok_embeddings.weight", - "norm.weight": "norm.scale", - "layers.{}.attention.wk.weight": "layers.{}.attn.k_proj.weight", - "layers.{}.attention.wq.weight": "layers.{}.attn.q_proj.weight", - "layers.{}.attention.wv.weight": "layers.{}.attn.v_proj.weight", - "layers.{}.attention.wo.weight": "layers.{}.attn.output_proj.weight", - "layers.{}.attention_norm.weight": "layers.{}.sa_norm.scale", - "layers.{}.ffn_norm.weight": "layers.{}.mlp_norm.scale", - "layers.{}.feed_forward.w1.weight": "layers.{}.mlp.w1.weight", - "layers.{}.feed_forward.w2.weight": "layers.{}.mlp.w2.weight", - "layers.{}.feed_forward.w3.weight": "layers.{}.mlp.w3.weight", -} + return converted_state_dict def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: From 024b0da461f657d6abe596891b754adf0d48fc17 Mon Sep 17 00:00:00 2001 From: Scott Roy Date: Sat, 6 Sep 2025 00:21:11 -0700 Subject: [PATCH 04/16] up --- examples/models/checkpoint.py | 2 ++ examples/models/phi_4_mini/convert_weights.py | 2 +- examples/models/qwen3/convert_weights.py | 4 +--- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/models/checkpoint.py b/examples/models/checkpoint.py index 4583b105732..53a3b85d385 100644 --- a/examples/models/checkpoint.py +++ b/examples/models/checkpoint.py @@ -11,6 +11,8 @@ import os from pathlib import Path from typing import Any, Dict, Optional +import os +import json import torch diff --git a/examples/models/phi_4_mini/convert_weights.py b/examples/models/phi_4_mini/convert_weights.py index 1bae99ec03a..c7bf515494c 100644 --- a/examples/models/phi_4_mini/convert_weights.py +++ b/examples/models/phi_4_mini/convert_weights.py @@ -2,9 +2,9 @@ from typing import Dict import torch -from executorch.examples.models.checkpoint import load_checkpoint_from_pytorch_model from torchtune.models.convert_weights import get_mapped_key +from executorch.examples.models.checkpoint import load_checkpoint_from_pytorch_model from torchtune.training import FullModelHFCheckpointer diff --git a/examples/models/qwen3/convert_weights.py b/examples/models/qwen3/convert_weights.py index e905d435ac0..70d65ab494f 100644 --- a/examples/models/qwen3/convert_weights.py +++ b/examples/models/qwen3/convert_weights.py @@ -9,6 +9,7 @@ from safetensors.torch import load_file from torchtune.models.convert_weights import get_mapped_key +from executorch.examples.models.checkpoint import load_checkpoint_from_pytorch_model # Standard _FROM_META weight mapping of Meta weights to TorchTune + additional bias weight mappings. 
 _QWEN_3_FROM_META = {
@@ -89,9 +90,6 @@ def load_checkpoint_from_safetensors(input_dir: str) -> Dict:

     raise FileNotFoundError(f"Could not find safetensors checkpoint in {input_dir}")


-    raise FileNotFoundError(f"Could not find pytorch_model checkpoint in {input_dir}")
-
-
 def load_checkpoint(input_dir: str) -> Dict:
     try:
         print("Loading checkpoint from pytorch_model directory")

From 57dbf96d2a2e51983c1a64e5642a8698e85a883b Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Sun, 7 Sep 2025 16:49:40 -0700
Subject: [PATCH 05/16] lint

---
 examples/models/checkpoint.py                 | 2 --
 examples/models/phi_4_mini/convert_weights.py | 2 +-
 examples/models/qwen3/convert_weights.py      | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/examples/models/checkpoint.py b/examples/models/checkpoint.py
index 53a3b85d385..4583b105732 100644
--- a/examples/models/checkpoint.py
+++ b/examples/models/checkpoint.py
@@ -11,8 +11,6 @@
 import os
 from pathlib import Path
 from typing import Any, Dict, Optional
-import os
-import json

 import torch

diff --git a/examples/models/phi_4_mini/convert_weights.py b/examples/models/phi_4_mini/convert_weights.py
index c7bf515494c..1bae99ec03a 100644
--- a/examples/models/phi_4_mini/convert_weights.py
+++ b/examples/models/phi_4_mini/convert_weights.py
@@ -2,9 +2,9 @@
 from typing import Dict

 import torch
+from executorch.examples.models.checkpoint import load_checkpoint_from_pytorch_model
 from torchtune.models.convert_weights import get_mapped_key

-from executorch.examples.models.checkpoint import load_checkpoint_from_pytorch_model
 from torchtune.training import FullModelHFCheckpointer

diff --git a/examples/models/qwen3/convert_weights.py b/examples/models/qwen3/convert_weights.py
index 70d65ab494f..4d492e90fc4 100644
--- a/examples/models/qwen3/convert_weights.py
+++ b/examples/models/qwen3/convert_weights.py
@@ -9,7 +9,6 @@
 from safetensors.torch import load_file

 from torchtune.models.convert_weights import get_mapped_key
-from executorch.examples.models.checkpoint import load_checkpoint_from_pytorch_model

 # Standard _FROM_META weight mapping of Meta weights to TorchTune + additional bias weight mappings.
 _QWEN_3_FROM_META = {

From 51f41cb1b3d0b54359b78ff83f725c398c49adb6 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Sun, 7 Sep 2025 17:08:26 -0700
Subject: [PATCH 06/16] Add quantized checkpoint tests

---
 .../test_torchao_huggingface_checkpoints.sh   | 109 ++++++++++++++++++
 .github/workflows/trunk.yml                   |  96 ++++++++++++++-
 2 files changed, 202 insertions(+), 3 deletions(-)
 create mode 100644 .ci/scripts/test_torchao_huggingface_checkpoints.sh

diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
new file mode 100644
index 00000000000..12c182d8620
--- /dev/null
+++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+MODEL_NAME=${1:-}
+
+if [[ -z "$MODEL_NAME" ]]; then
+    echo "Usage: $0 <model_name>"
+    echo "Supported model_name values: qwen3_4b, phi_4_mini"
+    exit 1
+fi
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+    PYTHON_EXECUTABLE=python3
+fi
+
+MODEL_OUT=model.pte
+
+case "$MODEL_NAME" in
+    qwen3_4b)
+        echo "Running Qwen3-4B export..."
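+        # `hf download` prints the local snapshot directory of the checkpoint,
+        # so the converter (and later the tokenizer path) can point straight at
+        # the downloaded files. Assumed layout of that directory:
+        #   $HF_MODEL_DIR/pytorch_model.bin  - quantized checkpoint to convert
+        #   $HF_MODEL_DIR/tokenizer.json     - consumed by llama_main below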
+ HF_MODEL_DIR=$(hf download metascroy/Qwen3-4B-INT8-INT4) + EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB + $PYTHON_EXECUTABLE -m executorch.examples.models.qwen3.convert_weights \ + $HF_MODEL_DIR \ + pytorch_model_converted.bin + + $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \ + --model "qwen3_4b" \ + --checkpoint pytorch_model_converted.bin \ + --params examples/models/qwen3/config/4b_config.json \ + --output_name $MODEL_OUT \ + -kv \ + --use_sdpa_with_kv_cache \ + -X \ + --xnnpack-extended-ops \ + --max_context_length 1024 \ + --max_seq_length 1024 \ + --dtype fp32 \ + --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' + ;; + + phi_4_mini) + echo "Running Phi-4-mini export..." + HF_MODEL_DIR=$(hf download metascroy/Phi-4-mini-instruct-INT8-INT4) + EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB + $PYTHON_EXECUTABLE -m executorch.examples.models.phi_4_mini.convert_weights \ + $HF_MODEL_DIR \ + pytorch_model_converted.bin + + # $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \ + # --model "phi_4_mini" \ + # --checkpoint pytorch_model_converted.bin \ + # --params examples/models/phi_4_mini/config/config.json \ + # --output_name $MODEL_OUT \ + # -kv \ + # --use_sdpa_with_kv_cache \ + # -X \ + # --xnnpack-extended-ops \ + # --max_context_length 1024 \ + # --max_seq_length 1024 \ + # --dtype fp32 \ + # --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' + ;; + + *) + echo "Error: unsupported model_name '$MODEL_NAME'" + echo "Supported values: qwen3_4b, phi_4_mini" + exit 1 + ;; +esac + +# Check file size +MODEL_SIZE=$(stat --printf="%s" $MODEL_OUT 2>/dev/null || stat -f%z $MODEL_OUT) +if [[ $MODEL_SIZE -gt $EXPECTED_MODEL_SIZE_UPPER_BOUND ]]; then + echo "Error: model size $MODEL_SIZE is greater than expected upper bound $EXPECTED_MODEL_SIZE_UPPER_BOUND" + exit 1 +fi + +# Install ET with CMake +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ + -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ + -Bcmake-out . 
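+# The configure invocation above only generates the build tree; the install
+# build below stages the ExecuTorch headers and libraries into cmake-out,
+# which the llama runner build in the next step picks up.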
+cmake --build cmake-out -j16 --config Release --target install + +# Install llama runner +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_BUILD_TYPE=Release \ + -Bcmake-out/examples/models/llama \ + examples/models/llama +cmake --build cmake-out/examples/models/llama -j16 --config Release + +# Run the model +./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path="${HF_MODEL_DIR}/tokenizer.json" --prompt="Once upon a time," + +# Clean up +rm pytorch_model_converted.bin +rm $MODEL_OUT diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 251bb238f1b..a869435839b 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -550,6 +550,65 @@ jobs: # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -upload "${ARTIFACTS_DIR_NAME}" + test-llama-runner-linux: + # Test Both linux x86 and linux aarch64 + name: test-llama-runner-linux + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + matrix: + dtype: [fp32] + mode: [portable, xnnpack+custom] + runner: [linux.2xlarge, linux.arm64.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64] + include: + - dtype: bf16 + mode: portable + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + - dtype: bf16 + mode: portable + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + - dtype: bf16 + mode: custom + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + # Excluding specific runner + docker image combinations that don't make sense: + # - Excluding the ARM64 gcc image on the x86 runner (linux.2xlarge) + # - Excluding the x86 clang image on the ARM64 runner (linux.arm64.2xlarge) + exclude: + - runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + - runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + fail-fast: false + with: + runner: ${{ matrix.runner }} + docker-image: ci-image:${{ matrix.docker-image }} + submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 900 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + DTYPE=${{ matrix.dtype }} + BUILD_TOOL="cmake" + MODE=${{ matrix.mode }} + ARTIFACTS_DIR_NAME="artifacts-to-be-uploaded/${DTYPE}-${MODE}" + ARTIFACTS_DIR_NAME="${ARTIFACTS_DIR_NAME/+/-}" + + # Setup executorch + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" + # Install requirements for export_llama + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh + # Test llama2 + PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -upload "${ARTIFACTS_DIR_NAME}" + test-llama-runner-macos: name: test-llama-runner-mac uses: pytorch/test-infra/.github/workflows/macos_job.yml@main @@ -585,6 +644,37 @@ jobs: # Test llama2 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh -model stories110M -build_tool cmake -dtype "${DTYPE}" -mode "${MODE}" + test-torchao_huggingface_checkpoints: + name: test-llama-runner-linux + uses: 
pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + matrix: + mode: [xnnpack+custom] + runner: [linux.2xlarge, linux.arm64.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64] + exclude: + - runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + - runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + fail-fast: false + with: + runner: ${{ matrix.runner }} + docker-image: ci-image:${{ matrix.docker-image }} + submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 900 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + sh .ci/scripts/test_torchao_huggingface_checkpoints.sh phi_4_mini + sh .ci/scripts/test_torchao_huggingface_checkpoints.sh qwen3_4b + # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. # test-llava-runner-macos: # name: test-llava-runner-macos @@ -993,13 +1083,13 @@ jobs: timeout: 60 script: | conda init powershell - + powershell -Command "& { Set-PSDebug -Trace 1 \$ErrorActionPreference = 'Stop' \$PSNativeCommandUseErrorActionPreference = \$true - .ci/scripts/setup-windows.ps1 + .ci/scripts/setup-windows.ps1 powershell .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }} - }" \ No newline at end of file + }" From 3af11d6074de7b0addefbce242836bdbc9a57e40 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Sun, 7 Sep 2025 17:11:42 -0700 Subject: [PATCH 07/16] up --- .github/workflows/trunk.yml | 65 ++----------------------------------- 1 file changed, 3 insertions(+), 62 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index a869435839b..9f2757bb21a 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -550,65 +550,6 @@ jobs: # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -upload "${ARTIFACTS_DIR_NAME}" - test-llama-runner-linux: - # Test Both linux x86 and linux aarch64 - name: test-llama-runner-linux - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - permissions: - id-token: write - contents: read - strategy: - matrix: - dtype: [fp32] - mode: [portable, xnnpack+custom] - runner: [linux.2xlarge, linux.arm64.2xlarge] - docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64] - include: - - dtype: bf16 - mode: portable - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 - - dtype: bf16 - mode: portable - runner: linux.arm64.2xlarge - docker-image: executorch-ubuntu-22.04-gcc11-aarch64 - - dtype: bf16 - mode: custom - runner: linux.arm64.2xlarge - docker-image: executorch-ubuntu-22.04-gcc11-aarch64 - # Excluding specific runner + docker image combinations that don't make sense: - # - Excluding the ARM64 gcc image on the x86 runner (linux.2xlarge) - # - Excluding the x86 clang image on the ARM64 runner (linux.arm64.2xlarge) - exclude: - - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-gcc11-aarch64 - - runner: linux.arm64.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 - fail-fast: false - with: - runner: ${{ 
matrix.runner }} - docker-image: ci-image:${{ matrix.docker-image }} - submodules: 'recursive' - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 900 - script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - DTYPE=${{ matrix.dtype }} - BUILD_TOOL="cmake" - MODE=${{ matrix.mode }} - ARTIFACTS_DIR_NAME="artifacts-to-be-uploaded/${DTYPE}-${MODE}" - ARTIFACTS_DIR_NAME="${ARTIFACTS_DIR_NAME/+/-}" - - # Setup executorch - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" - # Install requirements for export_llama - PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh - # Test llama2 - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -upload "${ARTIFACTS_DIR_NAME}" - test-llama-runner-macos: name: test-llama-runner-mac uses: pytorch/test-infra/.github/workflows/macos_job.yml@main @@ -644,8 +585,8 @@ jobs: # Test llama2 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh -model stories110M -build_tool cmake -dtype "${DTYPE}" -mode "${MODE}" - test-torchao_huggingface_checkpoints: - name: test-llama-runner-linux + test-torchao-huggingface-checkpoints: + name: test-torchao-huggingface-checkpoints uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write @@ -672,8 +613,8 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - sh .ci/scripts/test_torchao_huggingface_checkpoints.sh phi_4_mini sh .ci/scripts/test_torchao_huggingface_checkpoints.sh qwen3_4b + sh .ci/scripts/test_torchao_huggingface_checkpoints.sh phi_4_mini # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. # test-llava-runner-macos: From ba6fb4b184221cf9a1d602c6e74fe9150aae4ca4 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 8 Sep 2025 10:01:44 -0700 Subject: [PATCH 08/16] up --- examples/models/phi_4_mini/convert_weights.py | 28 ------------------- examples/models/qwen3/convert_weights.py | 1 + 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/examples/models/phi_4_mini/convert_weights.py b/examples/models/phi_4_mini/convert_weights.py index 1bae99ec03a..01b7302ed2e 100644 --- a/examples/models/phi_4_mini/convert_weights.py +++ b/examples/models/phi_4_mini/convert_weights.py @@ -106,34 +106,6 @@ def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.T return converted_state_dict -def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: - """ - Convert a state dict from torchtune's format to Meta's format. This function - doesn't handle any sharding or splitting of state dicts. It follows the - state_dict IN -> state_dict OUT pattern. - Args: - state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format. - Returns: - Dict[str, torch.Tensor]: State dict in Meta's format. 
- """ - converted_state_dict = {} - inverted_mapping_dict = {v: k for k, v in _PHI_4_FROM_META.items()} - - # Single checkpoint - model_path = os.path.join(input_dir, "pytorch_model.bin") - if os.path.exists(model_path): - state_dict = torch.load( - model_path, weights_only=True, map_location=torch.device("cpu") - ) - return state_dict - - # Input and output embeddings are tied. - converted_state_dict["output.weight"] = converted_state_dict[ - "tok_embeddings.weight" - ] - return converted_state_dict - - def convert_weights(input_dir_or_checkpoint: str, output_file: str) -> None: try: sd = load_checkpoint_from_pytorch_model(input_dir_or_checkpoint) diff --git a/examples/models/qwen3/convert_weights.py b/examples/models/qwen3/convert_weights.py index 4d492e90fc4..404fd4cbe88 100644 --- a/examples/models/qwen3/convert_weights.py +++ b/examples/models/qwen3/convert_weights.py @@ -89,6 +89,7 @@ def load_checkpoint_from_safetensors(input_dir: str) -> Dict: raise FileNotFoundError(f"Could not find safetensors checkpoint in {input_dir}") + def load_checkpoint(input_dir: str) -> Dict: try: print("Loading checkpoint from pytorch_model directory") From c3b6f0e5fdd8430264bab2d96ad557f673bb4a18 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 8 Sep 2025 10:07:39 -0700 Subject: [PATCH 09/16] up --- .github/workflows/trunk.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 9f2757bb21a..e9f9787dbe2 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -612,6 +612,7 @@ jobs: # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + pip install -U "huggingface_hub[cli]" sh .ci/scripts/test_torchao_huggingface_checkpoints.sh qwen3_4b sh .ci/scripts/test_torchao_huggingface_checkpoints.sh phi_4_mini From ed1561019b1dec4326177fdb14dd00fffa162b55 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 8 Sep 2025 10:30:08 -0700 Subject: [PATCH 10/16] up --- .../test_torchao_huggingface_checkpoints.sh | 26 +++++++++---------- .github/workflows/trunk.yml | 13 +++------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index 12c182d8620..a9ffd3ffb3f 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -47,19 +47,19 @@ case "$MODEL_NAME" in $HF_MODEL_DIR \ pytorch_model_converted.bin - # $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \ - # --model "phi_4_mini" \ - # --checkpoint pytorch_model_converted.bin \ - # --params examples/models/phi_4_mini/config/config.json \ - # --output_name $MODEL_OUT \ - # -kv \ - # --use_sdpa_with_kv_cache \ - # -X \ - # --xnnpack-extended-ops \ - # --max_context_length 1024 \ - # --max_seq_length 1024 \ - # --dtype fp32 \ - # --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' + $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \ + --model "phi_4_mini" \ + --checkpoint pytorch_model_converted.bin \ + --params examples/models/phi_4_mini/config/config.json \ + --output_name $MODEL_OUT \ + -kv \ + --use_sdpa_with_kv_cache \ + -X \ + --xnnpack-extended-ops \ + --max_context_length 1024 \ + --max_seq_length 1024 \ + --dtype fp32 \ + --metadata 
'{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' ;; *) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index e9f9787dbe2..be99a712a39 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -594,13 +594,8 @@ jobs: strategy: matrix: mode: [xnnpack+custom] - runner: [linux.2xlarge, linux.arm64.2xlarge] - docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64] - exclude: - - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-gcc11-aarch64 - - runner: linux.arm64.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 + runner: [linux.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12] fail-fast: false with: runner: ${{ matrix.runner }} @@ -614,8 +609,8 @@ jobs: conda activate "${CONDA_ENV}" pip install -U "huggingface_hub[cli]" - sh .ci/scripts/test_torchao_huggingface_checkpoints.sh qwen3_4b - sh .ci/scripts/test_torchao_huggingface_checkpoints.sh phi_4_mini + bash .ci/scripts/test_torchao_huggingface_checkpoints.sh qwen3_4b + bash .ci/scripts/test_torchao_huggingface_checkpoints.sh phi_4_mini # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. # test-llava-runner-macos: From 62b25ca44ad134bd26d9ad410c25c54ca66066d4 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 8 Sep 2025 10:40:34 -0700 Subject: [PATCH 11/16] up --- .github/workflows/trunk.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index be99a712a39..078c0cfcb0e 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -608,6 +608,7 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" pip install -U "huggingface_hub[cli]" + python install_executorch.py bash .ci/scripts/test_torchao_huggingface_checkpoints.sh qwen3_4b bash .ci/scripts/test_torchao_huggingface_checkpoints.sh phi_4_mini From e2a3abb58bcee4b28325f1d791a15c058b9a1c31 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 8 Sep 2025 10:56:04 -0700 Subject: [PATCH 12/16] up --- .github/workflows/trunk.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 078c0cfcb0e..870b964be1f 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -607,8 +607,9 @@ jobs: # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake pip install -U "huggingface_hub[cli]" - python install_executorch.py bash .ci/scripts/test_torchao_huggingface_checkpoints.sh qwen3_4b bash .ci/scripts/test_torchao_huggingface_checkpoints.sh phi_4_mini From 13004c2f7bd673d616d513ea4a0c3bac67ac34d7 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 8 Sep 2025 12:02:30 -0700 Subject: [PATCH 13/16] up --- .../test_torchao_huggingface_checkpoints.sh | 91 ++++++++++++------- .github/workflows/trunk.yml | 16 ++-- 2 files changed, 69 insertions(+), 38 deletions(-) diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index a9ffd3ffb3f..ea82abc51a9 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ 
b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
@@ -1,14 +1,41 @@
 #!/usr/bin/env bash
 set -euo pipefail

-MODEL_NAME=${1:-}
+# -------------------------
+# Args / flags
+# -------------------------
+TEST_WITH_RUNNER=0
+MODEL_NAME=""

-if [[ -z "$MODEL_NAME" ]]; then
-    echo "Usage: $0 <model_name>"
+# Parse args
+if [[ $# -lt 1 ]]; then
+    echo "Usage: $0 <model_name> [--test_with_runner]"
     echo "Supported model_name values: qwen3_4b, phi_4_mini"
     exit 1
 fi

+MODEL_NAME="$1"
+shift
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --test_with_runner)
+            TEST_WITH_RUNNER=1
+            ;;
+        -h|--help)
+            echo "Usage: $0 <model_name> [--test_with_runner]"
+            echo "  model_name: qwen3_4b | phi_4_mini"
+            echo "  --test_with_runner: build ET + run llama_main to sanity-check the export"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+    shift
+done
+
 if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
     PYTHON_EXECUTABLE=python3
 fi
@@ -77,33 +104,35 @@ if [[ $MODEL_SIZE -gt $EXPECTED_MODEL_SIZE_UPPER_BOUND ]]; then
 fi

 # Install ET with CMake
-cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DEXECUTORCH_ENABLE_LOGGING=1 \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -Bcmake-out .
-cmake --build cmake-out -j16 --config Release --target install
-
-# Install llama runner
-cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_BUILD_TYPE=Release \
-    -Bcmake-out/examples/models/llama \
-    examples/models/llama
-cmake --build cmake-out/examples/models/llama -j16 --config Release
-
-# Run the model
-./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path="${HF_MODEL_DIR}/tokenizer.json" --prompt="Once upon a time,"
+if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
+    echo "[runner] Building and testing llama_main ..."
+    cmake -DPYTHON_EXECUTABLE=python \
+        -DCMAKE_INSTALL_PREFIX=cmake-out \
+        -DEXECUTORCH_ENABLE_LOGGING=1 \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+        -DEXECUTORCH_BUILD_XNNPACK=ON \
+        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+        -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
+        -Bcmake-out .
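+    # The EXECUTORCH_BUILD_* switches above pull in the XNNPACK backend, the
+    # quantized/optimized kernels, and the LLM runner extension that
+    # llama_main links against.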
+ cmake --build cmake-out -j16 --config Release --target install + + # Install llama runner + cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_BUILD_TYPE=Release \ + -Bcmake-out/examples/models/llama \ + examples/models/llama + cmake --build cmake-out/examples/models/llama -j16 --config Release + + # Run the model + ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path="${HF_MODEL_DIR}/tokenizer.json" --prompt="Once upon a time," +fi # Clean up -rm pytorch_model_converted.bin -rm $MODEL_OUT +rm -f pytorch_model_converted.bin "$MODEL_OUT" diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 870b964be1f..f5c5161e0cc 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -593,13 +593,16 @@ jobs: contents: read strategy: matrix: - mode: [xnnpack+custom] - runner: [linux.2xlarge] - docker-image: [executorch-ubuntu-22.04-clang12] + model: [qwen3_4b, phi_4_mini] + include: + - model: qwen3_4b + test_with_runner: true + - model: phi_4_mini + test_with_runner: false fail-fast: false with: - runner: ${{ matrix.runner }} - docker-image: ci-image:${{ matrix.docker-image }} + runner: linux.2xlarge + docker-image: ci-image:executorch-ubuntu-22.04-clang12 submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 900 @@ -611,8 +614,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake pip install -U "huggingface_hub[cli]" - bash .ci/scripts/test_torchao_huggingface_checkpoints.sh qwen3_4b - bash .ci/scripts/test_torchao_huggingface_checkpoints.sh phi_4_mini + bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }} # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. # test-llava-runner-macos: From 1a8e1d7eaaa3bdd127c09c3ce0b4fa2d31b8358d Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 9 Sep 2025 12:22:00 -0700 Subject: [PATCH 14/16] up --- .../test_torchao_huggingface_checkpoints.sh | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index ea82abc51a9..3e400691d63 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -45,7 +45,7 @@ MODEL_OUT=model.pte case "$MODEL_NAME" in qwen3_4b) echo "Running Qwen3-4B export..." - HF_MODEL_DIR=$(hf download metascroy/Qwen3-4B-INT8-INT4) + HF_MODEL_DIR=$(hf download pytorch/Qwen3-4B-INT8-INT4) EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB $PYTHON_EXECUTABLE -m executorch.examples.models.qwen3.convert_weights \ $HF_MODEL_DIR \ @@ -68,7 +68,7 @@ case "$MODEL_NAME" in phi_4_mini) echo "Running Phi-4-mini export..." - HF_MODEL_DIR=$(hf download metascroy/Phi-4-mini-instruct-INT8-INT4) + HF_MODEL_DIR=$(hf download pytorch/Phi-4-mini-instruct-INT8-INT4) EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB $PYTHON_EXECUTABLE -m executorch.examples.models.phi_4_mini.convert_weights \ $HF_MODEL_DIR \ @@ -106,22 +106,7 @@ fi # Install ET with CMake if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then echo "[runner] Building and testing llama_main ..." 
- cmake -DPYTHON_EXECUTABLE=python \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DEXECUTORCH_ENABLE_LOGGING=1 \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ - -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ - -Bcmake-out . - cmake --build cmake-out -j16 --config Release --target install + cmake --preset llm -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out # Install llama runner cmake -DPYTHON_EXECUTABLE=python \ From 7647ed9fa51488af40ae9d55fc648dc698f3aba0 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 9 Sep 2025 13:18:20 -0700 Subject: [PATCH 15/16] up --- .ci/scripts/test_torchao_huggingface_checkpoints.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index 3e400691d63..1a864a00697 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -107,12 +107,14 @@ fi if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then echo "[runner] Building and testing llama_main ..." cmake --preset llm -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out + cmake --build cmake-out -j16 --target install --config Release # Install llama runner - cmake -DPYTHON_EXECUTABLE=python \ - -DCMAKE_BUILD_TYPE=Release \ - -Bcmake-out/examples/models/llama \ - examples/models/llama + cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DBUILD_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -Bcmake-out/examples/models/llama \ + examples/models/llama cmake --build cmake-out/examples/models/llama -j16 --config Release # Run the model From 2f47f54590cfd0efde081243dcec3a2c58051255 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:30:13 -0700 Subject: [PATCH 16/16] up --- .../test_torchao_huggingface_checkpoints.sh | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index 1a864a00697..c0910b47826 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -106,15 +106,29 @@ fi # Install ET with CMake if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then echo "[runner] Building and testing llama_main ..." - cmake --preset llm -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out - cmake --build cmake-out -j16 --target install --config Release + cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ + -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ + -Bcmake-out . 
+ cmake --build cmake-out -j16 --config Release --target install + # Install llama runner - cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DBUILD_TESTING=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -Bcmake-out/examples/models/llama \ - examples/models/llama + cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_BUILD_TYPE=Release \ + -Bcmake-out/examples/models/llama \ + examples/models/llama cmake --build cmake-out/examples/models/llama -j16 --config Release # Run the model