Commit 21b28c1

Merge branch 'main' into Arm-backend-Update-weight-observer-for-QAT

2 parents: e1fa331 + 5d63bad

197 files changed, +6219 −4033 lines

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-e7152ff8a6a929a0db7f3f4a72a5b6d471769cd3
+4d4abec80f03cd8fdefe1d9cb3a60d3690cd777e
.ci/scripts/test_torchao_huggingface_checkpoints.sh

Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# -------------------------
+# Args / flags
+# -------------------------
+TEST_WITH_RUNNER=0
+MODEL_NAME=""
+
+# Parse args
+if [[ $# -lt 1 ]]; then
+  echo "Usage: $0 <model_name> [--test_with_runner]"
+  echo "Supported model_name values: qwen3_4b, phi_4_mini"
+  exit 1
+fi
+
+MODEL_NAME="$1"
+shift
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --test_with_runner)
+      TEST_WITH_RUNNER=1
+      ;;
+    -h|--help)
+      echo "Usage: $0 <model_name> [--test_with_runner]"
+      echo "  model_name: qwen3_4b | phi_4_mini"
+      echo "  --test_with_runner: build ET + run llama_main to sanity-check the export"
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1"
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+MODEL_OUT=model.pte
+
+case "$MODEL_NAME" in
+  qwen3_4b)
+    echo "Running Qwen3-4B export..."
+    HF_MODEL_DIR=$(hf download pytorch/Qwen3-4B-INT8-INT4)
+    EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB
+    $PYTHON_EXECUTABLE -m executorch.examples.models.qwen3.convert_weights \
+      $HF_MODEL_DIR \
+      pytorch_model_converted.bin
+
+    $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \
+      --model "qwen3_4b" \
+      --checkpoint pytorch_model_converted.bin \
+      --params examples/models/qwen3/config/4b_config.json \
+      --output_name $MODEL_OUT \
+      -kv \
+      --use_sdpa_with_kv_cache \
+      -X \
+      --xnnpack-extended-ops \
+      --max_context_length 1024 \
+      --max_seq_length 1024 \
+      --dtype fp32 \
+      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
+    ;;
+
+  phi_4_mini)
+    echo "Running Phi-4-mini export..."
+    HF_MODEL_DIR=$(hf download pytorch/Phi-4-mini-instruct-INT8-INT4)
+    EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB
+    $PYTHON_EXECUTABLE -m executorch.examples.models.phi_4_mini.convert_weights \
+      $HF_MODEL_DIR \
+      pytorch_model_converted.bin
+
+    $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \
+      --model "phi_4_mini" \
+      --checkpoint pytorch_model_converted.bin \
+      --params examples/models/phi_4_mini/config/config.json \
+      --output_name $MODEL_OUT \
+      -kv \
+      --use_sdpa_with_kv_cache \
+      -X \
+      --xnnpack-extended-ops \
+      --max_context_length 1024 \
+      --max_seq_length 1024 \
+      --dtype fp32 \
+      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
+    ;;
+
+  *)
+    echo "Error: unsupported model_name '$MODEL_NAME'"
+    echo "Supported values: qwen3_4b, phi_4_mini"
+    exit 1
+    ;;
+esac
+
+# Check file size
+MODEL_SIZE=$(stat --printf="%s" $MODEL_OUT 2>/dev/null || stat -f%z $MODEL_OUT)
+if [[ $MODEL_SIZE -gt $EXPECTED_MODEL_SIZE_UPPER_BOUND ]]; then
+  echo "Error: model size $MODEL_SIZE is greater than expected upper bound $EXPECTED_MODEL_SIZE_UPPER_BOUND"
+  exit 1
+fi
+
+# Install ET with CMake
+if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
+  echo "[runner] Building and testing llama_main ..."
+  cmake -DPYTHON_EXECUTABLE=python \
+    -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DEXECUTORCH_ENABLE_LOGGING=1 \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+    -DEXECUTORCH_BUILD_XNNPACK=ON \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
+    -Bcmake-out .
+  cmake --build cmake-out -j16 --config Release --target install
+
+
+  # Install llama runner
+  cmake -DPYTHON_EXECUTABLE=python \
+    -DCMAKE_BUILD_TYPE=Release \
+    -Bcmake-out/examples/models/llama \
+    examples/models/llama
+  cmake --build cmake-out/examples/models/llama -j16 --config Release
+
+  # Run the model
+  ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path="${HF_MODEL_DIR}/tokenizer.json" --prompt="Once upon a time,"
+fi
+
+# Clean up
+rm -f pytorch_model_converted.bin "$MODEL_OUT"
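For reference, the `hf download` step in the script above can be reproduced from Python. A minimal sketch using huggingface_hub (the library behind the hf CLI); the repo id is taken from the script, everything else is illustrative:

# Download the same checkpoint the script fetches with `hf download`.
# snapshot_download returns the local cache directory, i.e. the value
# HF_MODEL_DIR receives in the script above.
from huggingface_hub import snapshot_download

hf_model_dir = snapshot_download(repo_id="pytorch/Qwen3-4B-INT8-INT4")
print(hf_model_dir)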

.github/workflows/pull.yml

Lines changed: 7 additions & 0 deletions
@@ -971,6 +971,13 @@ jobs:
   ./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear
   ./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row

+  # "Classic" Operator tests
+  PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build
+  # TODO(ssjia): figure out how to run custom op tests in CI. Currently, they are
+  # failing due to the libstdc++.so.6 installed with conda not supporting
+  # GLIBCXX_3.4.30. These tests are still run in Meta internal CI.
+  # ./cmake-out/backends/vulkan/test/op_tests/vulkan_sdpa_test
+
   # Run e2e testing for selected operators. More operators will be tested via this
   # route in the future.
   python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*pt2e*"
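The GLIBCXX_3.4.30 symptom described in the TODO can be checked locally. A hedged sketch, assuming a conda environment where sys.prefix points at the env root and the `strings` utility is on PATH:

# Check whether the active env's libstdc++ exports GLIBCXX_3.4.30.
import pathlib
import subprocess
import sys

lib = pathlib.Path(sys.prefix) / "lib" / "libstdc++.so.6"
symbols = subprocess.run(
    ["strings", str(lib)], capture_output=True, text=True, check=True
).stdout.split()
print("GLIBCXX_3.4.30" in symbols)  # False reproduces the failure described above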

.github/workflows/trunk.yml

Lines changed: 36 additions & 2 deletions
@@ -8,6 +8,9 @@ on:
   tags:
     - ciflow/trunk/*
   pull_request:
+    paths:
+      - .ci/docker/ci_commit_pins/pytorch.txt
+      - .ci/scripts/**
   workflow_dispatch:

 concurrency:
@@ -582,6 +585,37 @@ jobs:
       # Test llama2
       PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh -model stories110M -build_tool cmake -dtype "${DTYPE}" -mode "${MODE}"

+  test-torchao-huggingface-checkpoints:
+    name: test-torchao-huggingface-checkpoints
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      matrix:
+        model: [qwen3_4b, phi_4_mini]
+        include:
+          - model: qwen3_4b
+            test_with_runner: true
+          - model: phi_4_mini
+            test_with_runner: false
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: ci-image:executorch-ubuntu-22.04-clang12
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        pip install -U "huggingface_hub[cli]"
+
+        bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }}
+
 # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner.
 # test-llava-runner-macos:
 #   name: test-llava-runner-macos
@@ -990,13 +1024,13 @@
       timeout: 60
       script: |
         conda init powershell
-
+
         powershell -Command "& {
           Set-PSDebug -Trace 1
           \$ErrorActionPreference = 'Stop'
           \$PSNativeCommandUseErrorActionPreference = \$true

-        .ci/scripts/setup-windows.ps1
+          .ci/scripts/setup-windows.ps1

           powershell .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }}
       }"

backends/apple/coreml/recipes/coreml_recipe_provider.py

Lines changed: 5 additions & 2 deletions
@@ -69,6 +69,7 @@ def create_recipe(
                 recipe_type, activation_dtype=torch.float32, **kwargs
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL:
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type,
                 weight_dtype=torch.int4,
@@ -77,6 +78,7 @@
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP:
             group_size = kwargs.pop("group_size", 32)
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type,
                 weight_dtype=torch.int4,
@@ -85,11 +87,14 @@
                 **kwargs,
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL:
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS16, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type, weight_dtype=torch.int8, is_per_channel=True, **kwargs
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP:
             group_size = kwargs.pop("group_size", 32)
+            # override minimum_deployment_target to ios18 for torchao (GH issue #13122)
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type,
                 weight_dtype=torch.int8,
@@ -312,8 +317,6 @@ def _build_torchao_quantized_recipe(
             ao_quantization_configs=[config],
         )

-        # override minimum_deployment_target to ios18 for torchao (GH issue #13122)
-        self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
        lowering_recipe = self._get_coreml_lowering_recipe(**kwargs)

        return ExportRecipe(
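The net effect of moving `_validate_and_set_deployment_target` out of `_build_torchao_quantized_recipe` and into `create_recipe` is that each TORCHAO recipe now enforces its own floor (iOS16 for INT8 per-channel, iOS18 otherwise) before the recipe is built. A hedged sketch of the resulting behavior; the import paths and the raised exception type are assumptions, not taken from this diff:

import coremltools as ct

# Assumed import locations; the diff only shows the provider module's file path.
from executorch.backends.apple.coreml.recipes.coreml_recipe_provider import (
    CoreMLRecipeProvider,
)
from executorch.backends.apple.coreml.recipes.coreml_recipe_types import (
    CoreMLRecipeType,
)

provider = CoreMLRecipeProvider()

# Accepted: INT8 weight-only per-channel now validates against an iOS16 floor
# (previously iOS18, matching the test change below).
recipe = provider.create_recipe(
    CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL,
    minimum_deployment_target=ct.target.iOS16,
)

# Rejected at create_recipe time: INT4 recipes keep the iOS18 floor.
try:
    provider.create_recipe(
        CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL,
        minimum_deployment_target=ct.target.iOS15,
    )
except ValueError as err:  # exception type is an assumption
    print(f"rejected: {err}")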

backends/apple/coreml/test/test_coreml_quantizer.py

Lines changed: 2 additions & 4 deletions
@@ -15,7 +15,7 @@
 )

 from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
-from torch.export import export_for_training
+from torch.export import export
 from torchao.quantization.pt2e.quantize_pt2e import (
     convert_pt2e,
     prepare_pt2e,
@@ -32,9 +32,7 @@ def quantize_and_compare(
 ) -> None:
     assert quantization_type in {"PTQ", "QAT"}

-    pre_autograd_aten_dialect = export_for_training(
-        model, example_inputs, strict=True
-    ).module()
+    pre_autograd_aten_dialect = export(model, example_inputs, strict=True).module()

     quantization_config = LinearQuantizerConfig.from_dict(
         {

backends/apple/coreml/test/test_coreml_recipes.py

Lines changed: 1 addition & 1 deletion
@@ -501,7 +501,7 @@ def test_minimum_deployment_target_validation(self):
             (CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18, {}),
             (
                 CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL,
-                ct.target.iOS18,
+                ct.target.iOS16,
                 {},
             ),
             (CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18, {}),

backends/apple/mps/test/test_mps_utils.py

Lines changed: 1 addition & 1 deletion
@@ -206,7 +206,7 @@ def lower_module_and_test_output(

     expected_output = model(*sample_inputs)

-    model = torch.export.export_for_training(
+    model = torch.export.export(
         model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True
     ).module()
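The CoreML and MPS test changes above are the same mechanical migration: the deprecated `torch.export.export_for_training` is replaced by `torch.export.export`, which likewise returns an ExportedProgram whose `.module()` is the pre-autograd ATen-dialect module that PT2E quantization consumes. A minimal standalone sketch (the toy model is illustrative):

import torch

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x)

model = TinyModel()
example_inputs = (torch.randn(2, 8),)

# Before: torch.export.export_for_training(model, example_inputs, strict=True).module()
# After: one strict export, same downstream .module() call.
pre_autograd_aten_dialect = torch.export.export(
    model, example_inputs, strict=True
).module()
print(type(pre_autograd_aten_dialect))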

backends/arm/README.md

Lines changed: 13 additions & 8 deletions
@@ -206,14 +206,6 @@ The current TOSA version does not support int64. However, int64 is commonly used
 - For quantized models, these transformations will be automatically handled during annotation before the export stage.

 List of model specific and optional passes:
-- InsertCastForOpsWithInt64InputPass
-  - Functionality:
-    - For LLMs such as LLama, some opeartors like aten.embedding have int64 input. In order to lower these operators to TOSA, this pass will insert a casting node that converts the input from int64 to int32.
-  - Supported Ops:
-    - aten.embedding.default, aten.slice_copy.Tensor
-  - Example usage:
-    - backends/arm/test/models/test_llama.py
-
 - ConvertInt64ConstOpsToInt32Pass
   - Functionalities:
     - Rewrites constant-producing ops that output int64 to instead output int32, when values are within int32 bounds.
@@ -244,3 +236,16 @@ List of model specific and optional passes:
   - Example usage:
     - (Functionality 1) backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
     - (Functionality 2) backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+
+- InsertInt32CastsAfterInt64PlaceholdersPass
+  - Functionalities:
+    - Inserts an int64 -> int32 cast immediately after each int64 placeholder (graph input).
+    - Redirects all uses of each int64 placeholder to its int32 cast output.
+    - Inserts local int32 -> int64 casts at call sites where an operator requires int64 inputs, e.g. `torch.nn.functional.one_hot`.
+  - Pass ordering:
+    - When used with `ConvertInt64ConstOpsToInt32Pass` and `ConvertInt64OutputOpsToInt32Pass`, run this pass last.
+    - Rationale: Those passes may cause retracing to re-infer some int64 placeholders as int32. Running this pass last casts only inputs that remain int64, minimizing inserted casts.
+  - Example usage:
+    - backends/arm/test/models/test_llama.py
+    - backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+    - backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
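The ordering rule in the new README entry is easiest to see as a pass list. A sketch only: `InsertInt32CastsAfterInt64PlaceholdersPass` is exported from `backends/arm/_passes` per the change below, while the import location of the two converter passes is an assumption.

# Illustrative ordering; classes are listed rather than instantiated to avoid
# assuming constructor signatures.
from executorch.backends.arm._passes import (  # converter-pass exports assumed
    ConvertInt64ConstOpsToInt32Pass,
    ConvertInt64OutputOpsToInt32Pass,
    InsertInt32CastsAfterInt64PlaceholdersPass,
)

# Run the placeholder-cast pass last: the converter passes may retrace the graph
# and re-infer some int64 placeholders as int32, so casting afterwards touches
# only the inputs that remain int64.
INT64_HANDLING_PASS_ORDER = [
    ConvertInt64ConstOpsToInt32Pass,
    ConvertInt64OutputOpsToInt32Pass,
    InsertInt32CastsAfterInt64PlaceholdersPass,
]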

backends/arm/_passes/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -75,8 +75,8 @@
 from .fuse_constant_ops_pass import ComputeConstantOpsAOT, FuseConstantArgsPass  # noqa
 from .fuse_equal_placeholders_pass import FuseEqualPlaceholdersPass  # noqa
 from .fuse_quantized_activation_pass import FuseQuantizedActivationPass  # noqa
-from .insert_int64_input_cast_pass import (  # noqa # noqa
-    InsertCastForOpsWithInt64InputPass,
+from .insert_int32_casts_after_int64_placeholders import (  # noqa
+    InsertInt32CastsAfterInt64PlaceholdersPass,
 )
 from .insert_rescales_pass import InsertRescalePass  # noqa
 from .insert_table_ops import InsertTableOpsPass  # noqa
