
Commit 641e9b5

Update on "[et] generate debug handle before operator decomposition"
This diff updates debug handle generation: instead of each node in the edge program getting its own individual debug handle, all nodes that share the same ancestor in the exported graph now share one debug handle. This moves the starting point for tracing node transformations from the edge graph back to the exported graph. Differential Revision: [D76860368](https://our.internmc.facebook.com/intern/diff/D76860368/) [ghstack-poisoned]
2 parents 3a66602 + fc5a110 commit 641e9b5
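As a rough illustration of the new scheme (a toy sketch with hypothetical names, not the actual pass; real nodes carry their export-graph provenance in metadata), every edge-graph node that traces back to the same exported-graph ancestor receives one shared debug handle:

# Toy sketch only: plain dicts stand in for FX graph nodes, and "ancestor"
# stands in for the export-graph node an edge node was decomposed from.
def assign_debug_handles(edge_nodes):
    handles = {}  # export-graph ancestor -> shared debug handle
    next_handle = 1
    for node in edge_nodes:
        ancestor = node["ancestor"]
        if ancestor not in handles:
            handles[ancestor] = next_handle
            next_handle += 1
        node["debug_handle"] = handles[ancestor]
    return edge_nodes

# Example: aten.linear decomposes into t + addmm in the edge graph; both
# inherit the handle of their shared ancestor, so runtime events map back
# to the original exported-graph node rather than to decomposition artifacts.
nodes = [
    {"name": "t", "ancestor": "linear"},
    {"name": "addmm", "ancestor": "linear"},
    {"name": "relu", "ancestor": "relu"},
]
for n in assign_debug_handles(nodes):
    print(n["name"], "->", n["debug_handle"])  # t -> 1, addmm -> 1, relu -> 2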

173 files changed: +3639 additions, -1456 deletions

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+a3942627f5ac048e06b4b1d703b0a6a53bf6da5b

.ci/scripts/test_model.sh

Lines changed: 2 additions & 2 deletions
@@ -102,15 +102,15 @@ test_model() {
     bash examples/models/llama/install_requirements.sh
     # Test export_llm script: python3 -m extension.llm.export.export_llm.
     # Use Llama random checkpoint with Qwen 2.5 1.5b model configuration.
-    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/config/1_5b_config.json
     rm "./${MODEL_NAME}.pte"
     return # Skip running with portable executor runner since portable doesn't support Qwen's biased linears.
   fi
   if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh
     # Test export_llm script: python3 -m extension.llm.export.export_llm.
-    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json
+    "${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config/config.json
     run_portable_executor_runner
     rm "./${MODEL_NAME}.pte"
     return

.github/workflows/android-perf-private-device-experiment.yml

Lines changed: 1 addition & 1 deletion
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
       devices: samsung_galaxy_s22_private
       benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/android-perf.yml

Lines changed: 5 additions & 13 deletions
@@ -72,7 +72,7 @@ jobs:
         # Separate default values from the workflow dispatch. To ensure defaults are accessible
         # during scheduled runs and to provide flexibility for different defaults between
         # on-demand and periodic benchmarking.
-        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
         CRON_DEFAULT_DEVICES: samsung_galaxy_s22
       run: |
         set -eux
@@ -317,7 +317,7 @@ jobs:
         DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
         python -m extension.llm.export.export_llm \
           base.model_class=qwen3_0_6b \
-          base.params=examples/models/qwen3/0_6b_config.json \
+          base.params=examples/models/qwen3/config/0_6b_config.json \
           model.use_kv_cache=true \
           model.use_sdpa_with_kv_cache=true \
           model.dtype_override=fp32 \
@@ -341,10 +341,11 @@ jobs:
         echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"

         # Install optimum-executorch
+        OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         git clone https://github.com/huggingface/optimum-executorch
         pushd optimum-executorch
         # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
+        git checkout $OPTIMUM_ET_COMMIT
         python install_dev.py --skip_override_torch
         pip list
@@ -353,21 +354,12 @@ jobs:
           "--task" "text-generation"
           "--recipe" "xnnpack"
           "--use_custom_sdpa"
+          "--use_custom_kv_cache"
           "--qlinear"
           "--qembedding"
           "--output_dir" ".."
         )

-        # Add conditional arguments based on model
-        case "${HF_MODEL_REPO}" in
-          *"google/gemma-3-1b-it"*)
-            echo "--use_custom_kv_cache can not be used for HybridCache"
-            ;;
-          *)
-            ARGS+=("--use_custom_kv_cache")
-            ;;
-        esac
-
         optimum-cli export executorch "${ARGS[@]}"
         popd

.github/workflows/apple-perf-private-device-experiment.yml

Lines changed: 1 addition & 1 deletion
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
       devices: apple_iphone_15_private
       benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/apple-perf.yml

Lines changed: 5 additions & 13 deletions
@@ -72,7 +72,7 @@ jobs:
         # Separate default values from the workflow dispatch. To ensure defaults are accessible
         # during scheduled runs and to provide flexibility for different defaults between
         # on-demand and periodic benchmarking.
-        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
         CRON_DEFAULT_DEVICES: apple_iphone_15
       run: |
        set -eux
@@ -322,7 +322,7 @@ jobs:
         DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
         ${CONDA_RUN} python -m extension.llm.export.export_llm \
           base.model_class=qwen3_0_6b \
-          base.params=examples/models/qwen3/0_6b_config.json \
+          base.params=examples/models/qwen3/config/0_6b_config.json \
           model.use_kv_cache=true \
           model.use_sdpa_with_kv_cache=true \
           model.dtype_override=fp32 \
@@ -346,10 +346,11 @@ jobs:
         echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"

         # Install optimum-executorch
+        OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         git clone https://github.com/huggingface/optimum-executorch
         pushd optimum-executorch
         # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
+        git checkout $OPTIMUM_ET_COMMIT
         ${CONDA_RUN} python install_dev.py --skip_override_torch
         pip list
@@ -358,21 +359,12 @@ jobs:
           "--task" "text-generation"
           "--recipe" "xnnpack"
           "--use_custom_sdpa"
+          "--use_custom_kv_cache"
           "--qlinear"
           "--qembedding"
           "--output_dir" ".."
         )

-        # Add conditional arguments based on model
-        case "${HF_MODEL_REPO}" in
-          *"google/gemma-3-1b-it"*)
-            echo "--use_custom_kv_cache can not be used for HybridCache"
-            ;;
-          *)
-            ARGS+=("--use_custom_kv_cache")
-            ;;
-        esac
-
         ${CONDA_RUN} optimum-cli export executorch "${ARGS[@]}"
         popd

.github/workflows/trunk.yml

Lines changed: 32 additions & 11 deletions
@@ -594,10 +594,11 @@ jobs:
         echo "::group::Set up Hugging Face"
         pip install -U "huggingface_hub[cli]"
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         git clone https://github.com/huggingface/optimum-executorch
         pushd optimum-executorch
         # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
+        git checkout $OPTIMUM_ET_COMMIT
         python install_dev.py --skip_override_torch
         popd
         pip list
@@ -614,21 +615,12 @@ jobs:
           "--task" "text-generation"
           "--recipe" "xnnpack"
           "--use_custom_sdpa"
+          "--use_custom_kv_cache"
           "--qlinear"
           "--qembedding"
           "--output_dir" "${OUTPUT_DIR}"
         )

-        # Add conditional arguments based on model
-        case "${MODEL_ID}" in
-          *"google/gemma-3-1b-it"*)
-            echo "--use_custom_kv_cache can not be used for HybridCache"
-            ;;
-          *)
-            ARGS+=("--use_custom_kv_cache")
-            ;;
-        esac
-
         optimum-cli export executorch "${ARGS[@]}"

         ls -FlAGhp ${OUTPUT_DIR}
@@ -718,3 +710,32 @@ jobs:
       build-mode: Release
       build-tool: cmake
       docker-image: executorch-ubuntu-22.04-clang12
+
+  unittest-nxp-neutron:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Build and install Executorch
+        PYTHON_EXECUTABLE=python \
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
+        .ci/scripts/setup-linux.sh --build-tool "cmake"
+
+        # Install test requirements
+        pip install -r backends/nxp/requirements-tests.txt
+
+        # Run pytest
+        PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh

CMakeLists.txt

Lines changed: 13 additions & 2 deletions
@@ -86,6 +86,17 @@ announce_configured_options(CMAKE_TOOLCHAIN_FILE)
 load_build_preset()
 include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)

+# Enable ccache if available
+find_program(CCACHE_PROGRAM ccache)
+if(CCACHE_PROGRAM)
+  set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+  set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+  message(STATUS "ccache found and enabled for faster builds")
+else()
+  message(STATUS "ccache not found, builds will not be cached")
+endif()
+announce_configured_options(CCACHE_PROGRAM)
+
 # Print all the configs that were called with announce_configured_options.
 print_configured_options()

@@ -606,9 +617,9 @@ if(EXECUTORCH_BUILD_PYBIND)
 endif()

 if(EXECUTORCH_BUILD_XNNPACK)
-  # need to explicitly specify XNNPACK and microkernels-prod here otherwise
+  # need to explicitly specify XNNPACK and xnnpack-microkernels-prod here otherwise
   # uses XNNPACK and microkernel-prod symbols from libtorch_cpu
-  list(APPEND _dep_libs xnnpack_backend XNNPACK microkernels-prod)
+  list(APPEND _dep_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
 endif()

 # compile options for pybind

backends/arm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -22,7 +22,9 @@
 from .convert_split_to_slice import ConvertSplitToSlicePass  # noqa
 from .convert_squeezes_to_view import ConvertSqueezesToViewPass  # noqa
 from .convert_to_clamp import ConvertToClampPass  # noqa
+from .decompose_atan_pass import DecomposeAtanPass  # noqa
 from .decompose_avg_pool2d import DecomposeAvgPool2d  # noqa
+from .decompose_batch_norm_no_stats import DecomposeBatchNormNoStatsPass  # noqa
 from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
 from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 4 additions & 0 deletions
@@ -25,7 +25,9 @@
     ConvertSplitToSlicePass,
     ConvertSqueezesToViewPass,
     ConvertToClampPass,
+    DecomposeAtanPass,
     DecomposeAvgPool2d,
+    DecomposeBatchNormNoStatsPass,
     DecomposeCosineSimilarityPass,
     DecomposeDivPass,
     DecomposeEmbeddingPass,
@@ -150,6 +152,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
     def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(DecomposeRoundPass())
         self.add_pass(DecomposeSqrtPass())
+        self.add_pass(DecomposeAtanPass())
         self.add_pass(ConvertIntPowToMuls())
         self.add_pass(CastBoolToInt8Pass())
         self.add_pass(DecomposeSinhPass())
@@ -164,6 +167,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeGroupNormPass())
         self.add_pass(DecomposeLayerNormPass())
+        self.add_pass(DecomposeBatchNormNoStatsPass())
         self.add_pass(DecomposeVarPass())
         self.add_pass(
             DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec)
