Skip to content

Commit 6f163fa

Browse files
authored
Merge branch 'main' into jz/move-modeling
2 parents 984ece0 + b11075f commit 6f163fa

File tree

95 files changed

+2433
-799
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

95 files changed

+2433
-799
lines changed

.ci/scripts/test_model.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,15 +102,15 @@ test_model() {
102102
bash examples/models/llama/install_requirements.sh
103103
# Test export_llm script: python3 -m extension.llm.export.export_llm.
104104
# Use Llama random checkpoint with Qwen 2.5 1.5b model configuration.
105-
"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json
105+
"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/config/1_5b_config.json
106106
rm "./${MODEL_NAME}.pte"
107107
return # Skip running with portable executor runner since portable doesn't support Qwen's biased linears.
108108
fi
109109
if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
110110
# Install requirements for export_llama
111111
bash examples/models/llama/install_requirements.sh
112112
# Test export_llm script: python3 -m extension.llm.export.export_llm.
113-
"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json
113+
"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config/config.json
114114
run_portable_executor_runner
115115
rm "./${MODEL_NAME}.pte"
116116
return

.github/workflows/android-perf.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ jobs:
317317
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
318318
python -m extension.llm.export.export_llm \
319319
base.model_class=qwen3_0_6b \
320-
base.params=examples/models/qwen3/0_6b_config.json \
320+
base.params=examples/models/qwen3/config/0_6b_config.json \
321321
model.use_kv_cache=true \
322322
model.use_sdpa_with_kv_cache=true \
323323
model.dtype_override=fp32 \

.github/workflows/apple-perf.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ jobs:
322322
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
323323
${CONDA_RUN} python -m extension.llm.export.export_llm \
324324
base.model_class=qwen3_0_6b \
325-
base.params=examples/models/qwen3/0_6b_config.json \
325+
base.params=examples/models/qwen3/config/0_6b_config.json \
326326
model.use_kv_cache=true \
327327
model.use_sdpa_with_kv_cache=true \
328328
model.dtype_override=fp32 \

.github/workflows/trunk.yml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,3 +718,32 @@ jobs:
718718
build-mode: Release
719719
build-tool: cmake
720720
docker-image: executorch-ubuntu-22.04-clang12
721+
722+
unittest-nxp-neutron:
723+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
724+
permissions:
725+
id-token: write
726+
contents: read
727+
with:
728+
runner: linux.2xlarge
729+
docker-image: executorch-ubuntu-22.04-clang12
730+
submodules: 'recursive'
731+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
732+
timeout: 90
733+
script: |
734+
set -eux
735+
736+
# The generic Linux job chooses to use base env, not the one setup by the image
737+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
738+
conda activate "${CONDA_ENV}"
739+
740+
# Build and install Executorch
741+
PYTHON_EXECUTABLE=python \
742+
CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
743+
.ci/scripts/setup-linux.sh --build-tool "cmake"
744+
745+
# Install test requirements
746+
pip install -r backends/nxp/requirements-tests.txt
747+
748+
# Run pytest
749+
PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh

CMakeLists.txt

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,17 @@ announce_configured_options(CMAKE_TOOLCHAIN_FILE)
8686
load_build_preset()
8787
include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
8888

89+
# Enable ccache if available.
#
# ccache is installed as the C/C++ compiler launcher only when no launcher has
# been configured already (for example via -DCMAKE_CXX_COMPILER_LAUNCHER=... on
# the command line), so an explicit user choice is never overridden.
find_program(CCACHE_PROGRAM ccache)
if(NOT CCACHE_PROGRAM)
  message(STATUS "ccache not found, builds will not be cached")
elseif(DEFINED CMAKE_C_COMPILER_LAUNCHER OR DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
  message(
    STATUS
      "ccache found, but a compiler launcher is already configured; leaving it unchanged"
  )
else()
  set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
  set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
  message(STATUS "ccache found and enabled for faster builds")
endif()
announce_configured_options(CCACHE_PROGRAM)
99+
89100
# Print all the configs that were called with announce_configured_options.
90101
print_configured_options()
91102

@@ -606,9 +617,9 @@ if(EXECUTORCH_BUILD_PYBIND)
606617
endif()
607618

608619
if(EXECUTORCH_BUILD_XNNPACK)
609-
# need to explicitly specify XNNPACK and microkernels-prod here otherwise
620+
# need to explicitly specify XNNPACK and xnnpack-microkernels-prod here otherwise
610621
# uses XNNPACK and microkernel-prod symbols from libtorch_cpu
611-
list(APPEND _dep_libs xnnpack_backend XNNPACK microkernels-prod)
622+
list(APPEND _dep_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
612623
endif()
613624

614625
# compile options for pybind

backends/arm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222
from .convert_split_to_slice import ConvertSplitToSlicePass # noqa
2323
from .convert_squeezes_to_view import ConvertSqueezesToViewPass # noqa
2424
from .convert_to_clamp import ConvertToClampPass # noqa
25+
from .decompose_atan_pass import DecomposeAtanPass # noqa
2526
from .decompose_avg_pool2d import DecomposeAvgPool2d # noqa
27+
from .decompose_batch_norm_no_stats import DecomposeBatchNormNoStatsPass # noqa
2628
from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass # noqa
2729
from .decompose_div_pass import DecomposeDivPass # noqa
2830
from .decompose_embedding_pass import DecomposeEmbeddingPass # noqa # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
ConvertSplitToSlicePass,
2626
ConvertSqueezesToViewPass,
2727
ConvertToClampPass,
28+
DecomposeAtanPass,
2829
DecomposeAvgPool2d,
30+
DecomposeBatchNormNoStatsPass,
2931
DecomposeCosineSimilarityPass,
3032
DecomposeDivPass,
3133
DecomposeEmbeddingPass,
@@ -150,6 +152,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
150152
def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
151153
self.add_pass(DecomposeRoundPass())
152154
self.add_pass(DecomposeSqrtPass())
155+
self.add_pass(DecomposeAtanPass())
153156
self.add_pass(ConvertIntPowToMuls())
154157
self.add_pass(CastBoolToInt8Pass())
155158
self.add_pass(DecomposeSinhPass())
@@ -164,6 +167,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
164167
self.add_pass(DecomposeLeakyReLUPass())
165168
self.add_pass(DecomposeGroupNormPass())
166169
self.add_pass(DecomposeLayerNormPass())
170+
self.add_pass(DecomposeBatchNormNoStatsPass())
167171
self.add_pass(DecomposeVarPass())
168172
self.add_pass(
169173
DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec)
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
import logging
7+
from math import pi
8+
9+
from executorch.backends.arm._passes import ArmPass
10+
from executorch.exir.dialects._ops import ops as exir_ops
11+
12+
13+
edge_atan = exir_ops.edge.aten.atan.default # MI case
14+
15+
16+
def _get_atan_ops(op):
    """Return the primitive edge ops used to decompose ``op``.

    Only ``edge_atan`` is supported. The result is a 10-tuple in this fixed
    order: mul.Tensor, mul.Scalar, add.Tensor, add.Scalar, sub.Tensor,
    abs, gt.Scalar, reciprocal, where.self, neg. Callers unpack it
    positionally, so the order must not change.

    Raises:
        RuntimeError: If ``op`` is not ``edge_atan``.
    """
    if op is edge_atan:
        aten = exir_ops.edge.aten
        return (
            aten.mul.Tensor,
            aten.mul.Scalar,
            aten.add.Tensor,
            aten.add.Scalar,
            aten.sub.Tensor,
            aten.abs.default,
            aten.gt.Scalar,
            aten.reciprocal.default,
            aten.where.self,
            aten.neg.default,
        )

    raise RuntimeError(f"Can't decompose atan for op {op}")
33+
34+
35+
class DecomposeAtanPass(ArmPass):
    """Decomposes the atan operator into a rational (Padé) approximation.

    Each ``edge_atan`` node is replaced by the primitive edge ops returned by
    ``_get_atan_ops``; every other op is forwarded unchanged.
    """

    def _rational_approximation(self, z, ops, meta):
        """Creates a (2,1) Padé approximation for atan(x) on [-1, 1].

        Emits ops that compute ``z * (1 + a1*z^2 + a2*z^4) / (1 + b1*z^2)``.

        Args:
            z: Value to approximate atan for; ``call_operator`` passes the
                range-reduced input here.
            ops: Op tuple in the order returned by ``_get_atan_ops``.
            meta: Node metadata forwarded to every emitted op.

        Returns:
            The result of the final multiplication, i.e. the approximation
            of atan(z).
        """

        op_mul, op_mul_scalar, op_add, op_add_scalar, _, _, _, op_recip, _, _ = ops

        # Coefficients calculated using minimax on the interval [-1, 1].
        a1 = 0.3529666667
        a2 = -0.0287666667
        b1 = 0.6863

        z2 = super().call_operator(op_mul, (z, z), {}, meta, updated=True)
        z4 = super().call_operator(op_mul, (z2, z2), {}, meta, updated=True)

        # Numerator: 1 + a1*z^2 + a2*z^4
        num1 = super().call_operator(op_mul_scalar, (z2, a1), {}, meta, updated=True)
        num2 = super().call_operator(op_mul_scalar, (z4, a2), {}, meta, updated=True)
        num = super().call_operator(op_add_scalar, (num1, 1.0), {}, meta, updated=True)
        num = super().call_operator(op_add, (num, num2), {}, meta, updated=True)

        # Denominator: 1 + b1*z^2
        den1 = super().call_operator(op_mul_scalar, (z2, b1), {}, meta, updated=True)
        den = super().call_operator(op_add_scalar, (den1, 1.0), {}, meta, updated=True)

        # The division is expressed as a multiplication by the reciprocal.
        inv_den = super().call_operator(op_recip, (den,), {}, meta, updated=True)

        prod = super().call_operator(op_mul, (num, inv_den), {}, meta, updated=True)
        return super().call_operator(op_mul, (z, prod), {}, meta, updated=True)

    def call_operator(self, op, args, kwargs, meta):
        """Replace ``edge_atan`` with its decomposition; forward other ops.

        Args:
            op: Operator of the node being visited.
            args: Positional arguments of the node; ``args[0]`` is the atan input.
            kwargs: Keyword arguments of the node (only forwarded for non-atan ops).
            meta: Node metadata forwarded to every emitted op.

        Returns:
            The result of the forwarded op, or of the final ``where`` that
            selects the signed atan approximation.
        """
        if op is not edge_atan:
            return super().call_operator(op, args, kwargs, meta, updated=False)

        logging.info(
            f"Approximating atan. This may introduce small numerical errors. For details, see {__file__}."
        )

        ops = _get_atan_ops(op)
        (
            _,
            _,
            _,
            op_add_scalar,
            _,
            op_abs,
            op_gt,
            op_recip,
            op_where,
            op_neg,
        ) = ops

        x = args[0]

        # |x| > 1 is reduced to [0, 1] using atan(x) = pi/2 - atan(1/x) and atan(-x) = -atan(x).

        abs_x = super().call_operator(op_abs, (x,), {}, meta, updated=True)
        mask_hi = super().call_operator(op_gt, (abs_x, 1.0), {}, meta, updated=True)

        inv_x = super().call_operator(op_recip, (abs_x,), {}, meta, updated=True)
        z = super().call_operator(
            op_where, (mask_hi, inv_x, abs_x), {}, meta, updated=True
        )

        atan_z = self._rational_approximation(z, ops, meta)

        # pi/2 - atan(z), emitted as (-atan(z)) + pi/2. The constant must not be
        # derived from x (e.g. x * 0.0 + pi/2): for x = +/-inf that product is
        # NaN, which would turn atan(+/-inf) into NaN instead of +/-pi/2.
        neg_atan_z = super().call_operator(op_neg, (atan_z,), {}, meta, updated=True)
        diff = super().call_operator(
            op_add_scalar, (neg_atan_z, pi / 2), {}, meta, updated=True
        )
        atan_abs = super().call_operator(
            op_where, (mask_hi, diff, atan_z), {}, meta, updated=True
        )

        # Restore the sign of the input: atan(-x) = -atan(x).
        mask_pos = super().call_operator(op_gt, (x, 0.0), {}, meta, updated=True)
        neg_val = super().call_operator(op_neg, (atan_abs,), {}, meta, updated=True)

        return super().call_operator(
            op_where, (mask_pos, atan_abs, neg_val), {}, meta, updated=True
        )

0 commit comments

Comments
 (0)