Commit a4cf57a (2 parents: a9ac599 + 9b744b4)

Update base for Update on "gemma3 e2e runner on cuda"

This diff introduces an e2e runner for the gemma3 model on CUDA, delegating through the AOTI library and guarded by CI, along with the other infrastructure updates needed to build and run the `gemma3 e2e runner` on CUDA devices.

Differential Revision: [D85087532](https://our.internmc.facebook.com/intern/diff/D85087532/)

[ghstack-poisoned]
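The gemma3 CUDA runner itself sits in files beyond the visible portion of this diff, but the flow it enables mirrors the Voxtral Metal job added below: export through optimum-cli with an AOTI-backed recipe, then hand the exported program and weight blob to a native runner. A minimal sketch of that flow, assuming a "cuda" recipe, the google/gemma-3-4b-it model id, an aoti_cuda_blob.ptd artifact, and a gemma3_e2e_runner binary (all hypothetical names, not confirmed by this diff):

# Hypothetical export of gemma3 through the CUDA/AOTI delegate
# (model id and recipe name are assumptions).
optimum-cli export executorch \
  --model "google/gemma-3-4b-it" \
  --task "multimodal-text-to-text" \
  --recipe "cuda" \
  --output_dir ./

# Hypothetical e2e run consuming the exported program and AOTI blob
# (binary path and artifact names are assumptions).
cmake-out/examples/models/gemma3/gemma3_e2e_runner \
  --model_path model.pte \
  --data_path aoti_cuda_blob.ptd \
  --tokenizer_path tokenizer.json \
  --prompt "What is the capital of France?"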

File tree: 74 files changed, +2731 / -1851 lines
(file path not captured in this view)

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-09fdbd0a0639b128f712a4f5202ed42ca4c60957
+467660923a5a25e4718e1d6697b93ff1bab4e807

.ci/docker/requirements-ci.txt

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ sympy==1.12
 timm==0.6.13
 tomli==2.0.1
 torchsr==1.0.4
-transformers==4.47.1
+transformers==4.56.1
 zstd==1.5.5.1
 pandas>=2.2.2; python_version >= '3.10'
 pytest==7.2.0

.ci/scripts/test_phi_3_mini.sh

Lines changed: 11 additions & 12 deletions

@@ -36,34 +36,33 @@ cmake_build_phi_3_mini() {
   cmake --build ${BUILD_DIR}/${MODEL_DIR} -j${NPROC} --config ${BUILD_TYPE}
 }
 
-# Download and convert tokenizer.model
+# Download tokenizer.model
 prepare_tokenizer() {
-  echo "Downloading and converting tokenizer.model"
-  wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true"
-  $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
+  echo "Downloading tokenizer.model"
+  wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer.model?download=true"
 }
 
 # Export phi-3-mini model to pte
 export_phi_3_mini () {
   echo "Exporting phi-3-mini. This will take a few minutes"
-  $PYTHON_EXECUTABLE -m executorch.examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte
+  optimum-cli export executorch --model microsoft/Phi-3-mini-4k-instruct --task text-generation --recipe xnnpack --output_dir ./
 }
 
 run_and_verify() {
   NOW=$(date +"%H:%M:%S")
   echo "Starting to run phi-3-mini runner at ${NOW}"
-  if [[ ! -f "phi-3-mini.pte" ]]; then
-    echo "Export failed. Abort"
+  if [[ ! -f "model.pte" ]]; then
+    echo "Missing model artifact. Abort"
     exit 1
   fi
-  if [[ ! -f "tokenizer.bin" ]]; then
-    echo "tokenizer.bin is missing."
+  if [[ ! -f "tokenizer.model" ]]; then
+    echo "tokenizer.model is missing."
     exit 1
   fi
 
   ${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner \
-    --model_path=phi-3-mini.pte \
-    --tokenizer_path=tokenizer.bin \
+    --model_path=model.pte \
+    --tokenizer_path=tokenizer.model \
     --seq_len=60 \
    --temperature=0 \
     --prompt="<|system|>
@@ -92,7 +91,7 @@ What is the capital of France?<|end|>
 cmake_install_executorch_libraries
 cmake_build_phi_3_mini
 
-# Step 2. Export the tokenizer and model
+# Step 2. Export the model
 prepare_tokenizer
 export_phi_3_mini
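Net effect of the change: export now goes through optimum-cli, which writes model.pte into the working directory, and the runner consumes tokenizer.model directly, so the llama2c tokenizer.bin conversion step is gone. As the pull.yml change below shows, CI drives the script end to end with:

PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh Release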

.github/workflows/metal.yml

Lines changed: 191 additions & 0 deletions

@@ -0,0 +1,191 @@
+name: Test Metal Backend
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - release/*
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: false
+
+jobs:
+  test-metal-builds:
+    name: test-executorch-metal-build
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Test ExecuTorch Metal build"
+        PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_executorch.sh
+        echo "::endgroup::"
+
+  export-voxtral-metal-artifact:
+    name: export-voxtral-metal-artifact
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      upload-artifact: voxtral-metal-export
+      script: |
+        set -eux
+
+        echo "::group::Setup Huggingface"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        echo "::endgroup::"
+
+        echo "::group::Setup Optimum-ExecuTorch"
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        echo "Using optimum-executorch version: ${OPTIMUM_ET_VERSION}"
+        ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        ${CONDA_RUN} pip install mistral-common librosa
+        echo "::endgroup::"
+
+        echo "::group::Setup ExecuTorch"
+        PYTHON_EXECUTABLE=python ${CONDA_RUN} ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Pip List"
+        ${CONDA_RUN} pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral"
+        ${CONDA_RUN} optimum-cli export executorch \
+          --model "mistralai/Voxtral-Mini-3B-2507" \
+          --task "multimodal-text-to-text" \
+          --recipe "metal" \
+          --dtype bfloat16 \
+          --max_seq_len 1024 \
+          --output_dir ./
+        ${CONDA_RUN} python -m executorch.extension.audio.mel_spectrogram \
+          --feature_size 128 \
+          --stack_output \
+          --max_audio_len 300 \
+          --output_file voxtral_preprocessor.pte
+
+        test -f model.pte
+        test -f aoti_metal_blob.ptd
+        test -f voxtral_preprocessor.pte
+        echo "::endgroup::"
+
+        echo "::group::Store Voxtral Artifacts"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_metal_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}"
+        echo "::endgroup::"
+
+  test-voxtral-metal-e2e:
+    name: test-voxtral-metal-e2e
+    needs: export-voxtral-metal-artifact
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      download-artifact: voxtral-metal-export
+      script: |
+        set -eux
+
+        echo "::group::Print machine info"
+        uname -a
+        if [ $(uname -s) == Darwin ]; then
+          sw_vers
+          # Print RAM in GB
+          RAM_BYTES=$(sysctl -n hw.memsize)
+          RAM_GB=$(echo "scale=2; $RAM_BYTES/1024/1024/1024" | bc)
+          echo "Available RAM (GB): $RAM_GB"
+          sysctl machdep.cpu.brand_string
+          sysctl machdep.cpu.core_count
+          # Print number of GPU cores (Apple Silicon)
+          if command -v system_profiler &> /dev/null; then
+            GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Total Number of Cores/ {print $5; exit}')
+            if [ -z "$GPU_CORES" ]; then
+              # Fallback: try to parse "Core Count" from Apple GPU section
+              GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Core Count/ {print $3; exit}')
+            fi
+            echo "GPU Cores: ${GPU_CORES:-Unknown}"
+          else
+            echo "system_profiler not available, cannot determine GPU cores."
+          fi
+        fi
+        echo "::endgroup::"
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_requirements.sh
+        echo "::endgroup::"
+
+        echo "::group::Pip List"
+        ${CONDA_RUN} pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_metal_blob.ptd" .
+        cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
+        TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
+        curl -L $TOKENIZER_URL -o tekken.json
+        ls -al model.pte aoti_metal_blob.ptd voxtral_preprocessor.pte tekken.json
+        echo "::endgroup::"
+
+        echo "::group::Create Test Audio File"
+        say -o call_samantha_hall.aiff "Call Samantha Hall"
+        afconvert -f WAVE -d LEI16 call_samantha_hall.aiff call_samantha_hall.wav
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        ${CONDA_RUN} cmake --preset llm \
+          -DEXECUTORCH_BUILD_METAL=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        ${CONDA_RUN} cmake --build cmake-out -j$(( $(sysctl -n hw.ncpu) - 1 )) --target install --config Release
+
+        ${CONDA_RUN} cmake -DEXECUTORCH_BUILD_METAL=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/voxtral \
+          -Bcmake-out/examples/models/voxtral/
+        ${CONDA_RUN} cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Runner"
+        set +e
+        OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
+          --model_path model.pte \
+          --data_path aoti_metal_blob.ptd \
+          --tokenizer_path tekken.json \
+          --audio_path call_samantha_hall.wav \
+          --processor_path voxtral_preprocessor.pte \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "Samantha"; then
+          echo "Expected output 'Samantha' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"

.github/workflows/pull.yml

Lines changed: 5 additions & 2 deletions

@@ -632,11 +632,14 @@ jobs:
       # The generic Linux job chooses to use base env, not the one setup by the image
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
       conda activate "${CONDA_ENV}"
-
+      echo "::group::Setup ExecuTorch"
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
-
+      echo "::endgroup::"
+
+      echo "::group::Setup requirements"
       # install phi-3-mini requirements
       bash examples/models/phi-3-mini/install_requirements.sh
+      echo "::endgroup::"
 
       # run e2e (export, tokenizer and runner)
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh Release
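The echo "::group::..." / echo "::endgroup::" pairs added here are GitHub Actions workflow commands: every line printed between them is folded into a collapsible section in the job log, which keeps long setup output out of the way. The general shape:

echo "::group::Label shown in the log"
# ...commands whose output should be collapsed...
echo "::endgroup::"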

.gitignore

Lines changed: 0 additions & 1 deletion

@@ -62,7 +62,6 @@ xcuserdata/
 /include/
 /share/
 /version.py
-*.csv
 *_etdump
 
 # Android

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ executorch
 │ ├── <a href="backends/qualcomm">qualcomm</a> - Qualcomm-specific backends. See <a href="docs/source/backends-qualcomm.md">doc</a>.
 │ ├── <a href="backends/transforms">transforms</a> - Transformations for backend optimization.
 │ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends-vulkan.md">doc</a>.
-│ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends-xnnpack.md">doc</a>.
+│ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends/xnnpack/xnnpack-overview.md">doc</a>.
 ├── <a href="codegen">codegen</a> - Tooling to autogenerate bindings between kernels and the runtime.
 ├── <a href="configurations">configurations</a> - Configuration files.
 ├── <a href="devtools">devtools</a> - Model profiling, debugging, and inspection. Please refer to the <a href="docs/source/devtools-overview.md">tools documentation</a> for more information.

README-wheel.md

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ The `executorch` pip package is in beta.
 The prebuilt `executorch.runtime` module included in this package provides a way
 to run ExecuTorch `.pte` files, with some restrictions:
 * Only [core ATen operators](docs/source/ir-ops-set-definition.md) are linked into the prebuilt module
-* Only the [XNNPACK backend delegate](docs/source/backends-xnnpack.md) is linked into the prebuilt module.
+* Only the [XNNPACK backend delegate](docs/source/backends/xnnpack/xnnpack-overview.md) is linked into the prebuilt module.
 * \[macOS only] [Core ML](docs/source/backends/coreml/coreml-overview.md) and [MPS](docs/source/backends/mps/mps-overview.md) backend
 are also linked into the prebuilt module.

backends/arm/quantizer/quantization_annotator.py

Lines changed: 14 additions & 14 deletions

@@ -37,7 +37,7 @@ class _QuantProperty:
     """Specify how the input/output at 'index' must be quantized."""
 
     index: int
-    qspec: type[QuantizationSpecBase] | List[type[QuantizationSpecBase]]
+    qspec: QuantizationSpecBase | List[QuantizationSpecBase]
     optional: bool = False
     mark_annotated: bool = False
 
@@ -515,24 +515,24 @@ def any_or_hardtanh_min_zero(n: Node):
             _QuantProperty(0, input_act_qspec),
             _QuantProperty(
                 1,
-                input_act_qspec if node.args[0] == node.args[1] else shared_qspec,  # type: ignore[arg-type]
+                input_act_qspec if node.args[0] == node.args[1] else shared_qspec,
             ),
         ]
-        quant_properties.quant_output = _QuantProperty(0, shared_qspec)  # type: ignore[arg-type]
+        quant_properties.quant_output = _QuantProperty(0, shared_qspec)
     elif node.target in (torch.ops.aten.where.self,):
         shared_qspec = SharedQuantizationSpec(node.args[1])  # type: ignore[arg-type]
         quant_properties.quant_inputs = [
-            _QuantProperty(1, shared_qspec),  # type: ignore[arg-type]
-            _QuantProperty(2, shared_qspec),  # type: ignore[arg-type]
+            _QuantProperty(1, shared_qspec),
+            _QuantProperty(2, shared_qspec),
         ]
-        quant_properties.quant_output = _QuantProperty(0, shared_qspec)  # type: ignore[arg-type]
+        quant_properties.quant_output = _QuantProperty(0, shared_qspec)
     elif node.target in _one_to_one_shared_input_or_input_act_qspec:
         input_qspec = (
             SharedQuantizationSpec(node.args[0])  # type: ignore[arg-type]
-            if is_output_annotated(node.args[0])  # type: ignore
+            if is_output_annotated(node.args[0])  # type: ignore[arg-type]
             else input_act_qspec
         )
-        quant_properties.quant_inputs = [_QuantProperty(0, input_qspec)]  # type: ignore[arg-type]
+        quant_properties.quant_inputs = [_QuantProperty(0, input_qspec)]
         quant_properties.quant_output = _QuantProperty(
             0,
             SharedQuantizationSpec((node.args[0], node)),  # type: ignore[arg-type]
@@ -551,7 +551,7 @@ def any_or_hardtanh_min_zero(n: Node):
         if len(node.args[0]) == 0:
             raise ValueError("Expected non-empty list for node.args[0]")
 
-        shared_qspec = SharedQuantizationSpec((node.args[0][0], node))
+        shared_qspec = SharedQuantizationSpec((node.args[0][0], node))  # type: ignore[arg-type]
         quant_properties.quant_inputs = [
             _QuantProperty(
                 0,
@@ -561,7 +561,7 @@ def any_or_hardtanh_min_zero(n: Node):
                 ],
             )
         ]
-        quant_properties.quant_output = _QuantProperty(0, shared_qspec)  # type: ignore[arg-type]
+        quant_properties.quant_output = _QuantProperty(0, shared_qspec)
     elif node.target in _one_to_one:
         quant_properties.quant_inputs = [_QuantProperty(0, input_act_qspec)]
         quant_properties.quant_output = _QuantProperty(0, output_act_qspec)
@@ -583,7 +583,7 @@ def any_or_hardtanh_min_zero(n: Node):
             _QuantProperty(0, input_act_qspec),
             _QuantProperty(
                 1,
-                input_act_qspec if node.args[0] == node.args[1] else shared_qspec,  # type: ignore[arg-type]
+                input_act_qspec if node.args[0] == node.args[1] else shared_qspec,
             ),
         ]
         quant_properties.quant_output = None
@@ -596,11 +596,11 @@ def any_or_hardtanh_min_zero(n: Node):
         quant_properties.quant_inputs = []
         quant_properties.quant_output = _QuantProperty(0, output_act_qspec)
     elif node.target in [operator.getitem]:
-        if not is_output_annotated(node.args[0]):  # type: ignore[attr-defined, arg-type]
+        if not is_output_annotated(node.args[0]):  # type: ignore[arg-type]
             return None
         shared_qspec = SharedQuantizationSpec(node.args[0])  # type: ignore[arg-type]
-        quant_properties.quant_inputs = [_QuantProperty(0, shared_qspec)]  # type: ignore[arg-type]
-        quant_properties.quant_output = _QuantProperty(0, shared_qspec)  # type: ignore[arg-type]
+        quant_properties.quant_inputs = [_QuantProperty(0, shared_qspec)]
+        quant_properties.quant_output = _QuantProperty(0, shared_qspec)
     else:
         return None