Commit e5fe33d

Merge branch 'main' into deprecate_internal_models
2 parents: b2c9786 + fdfeaa4

222 files changed: +5044 −1572 lines


.github/workflows/cuda.yml

Lines changed: 207 additions & 4 deletions
@@ -164,6 +164,75 @@ jobs:
         ls -al "${RUNNER_ARTIFACT_DIR}"
         echo "::endgroup::"

+  export-gemma3-cuda-artifact:
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+            extra_args: ""
+          - name: "quantized-int4-tile-packed"
+            artifact: "gemma3-cuda-quantized-int4-tile-packed"
+            extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # TODO: enable int4-weight-only on gemma3.
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "voxtral-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: ${{ matrix.quant.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
+        optimum-cli export executorch \
+          --model "google/gemma-3-4b-it" \
+          --task "multimodal-text-to-text" \
+          --recipe "cuda" \
+          --dtype bfloat16 \
+          --device cuda \
+          --max_seq_len 64 \
+          --output_dir ./
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}/"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}/"
+        echo "::endgroup::"
+
   benchmark-voxtral-cuda:
     name: benchmark-voxtral-cuda
     needs: export-voxtral-cuda-artifact
@@ -186,7 +255,7 @@ jobs:
         set -eux

         echo "::group::Setup ExecuTorch Requirements"
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        ./install_requirements.sh
         pip list
         echo "::endgroup::"

@@ -204,13 +273,63 @@ jobs:
           -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
           -DEXECUTORCH_BUILD_TESTS=ON \
           -Bcmake-out .
-        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
         echo "::endgroup::"

         echo "::group::Run Voxtral Benchmark"

         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+        cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
+  benchmark-gemma3-cuda:
+    name: benchmark-gemma3-cuda
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+          -DEXECUTORCH_BUILD_TESTS=ON \
+          -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd

         echo "::endgroup::"

@@ -244,7 +363,7 @@ jobs:
         set -eux

         echo "::group::Setup ExecuTorch Requirements"
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        ./install_requirements.sh
         pip list
         echo "::endgroup::"

@@ -302,3 +421,87 @@ jobs:
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          - name: "quantized-int4-tile-packed"
+            artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # TODO: enable int4-weight-only on gemma3.
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/gemma3 \
+          -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+          --model_path model.pte \
+          --data_path aoti_cuda_blob.ptd \
+          --tokenizer_path tokenizer.json \
+          --image_path $IMAGE_PATH \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"

.mypy.ini

Lines changed: 6 additions & 0 deletions
@@ -83,6 +83,12 @@ ignore_missing_imports = True
 [mypy-tosa_tools.*]
 ignore_missing_imports = True

+[mypy-tosa_serializer]
+ignore_missing_imports = True
+
+[mypy-tosa_serializer.*]
+ignore_missing_imports = True
+
 [mypy-setuptools.*]
 ignore_missing_imports = True

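
These overrides mirror the existing tosa_tools entries: they keep mypy from erroring on imports of the TOSA serializer package, which ships without type stubs. The one-liner below is illustrative only (the module name is taken from the new ini sections); with the override in place, mypy treats the import as untyped instead of reporting a missing-stub error.

    # Illustrative: silenced by the new [mypy-tosa_serializer] overrides during type checking.
    import tosa_serializer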

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ executorch
 │ ├── <a href="backends/openvino">openvino</a> - OpenVINO backend for Intel hardware.
 │ ├── <a href="backends/qualcomm">qualcomm</a> - Qualcomm-specific backends. See <a href="docs/source/backends-qualcomm.md">doc</a>.
 │ ├── <a href="backends/transforms">transforms</a> - Transformations for backend optimization.
-│ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends-vulkan.md">doc</a>.
+│ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends/vulkan/vulkan-overview.md">doc</a>.
 │ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends/xnnpack/xnnpack-overview.md">doc</a>.
 ├── <a href="codegen">codegen</a> - Tooling to autogenerate bindings between kernels and the runtime.
 ├── <a href="configurations">configurations</a> - Configuration files.

README.md

Lines changed: 1 addition & 1 deletion
@@ -202,7 +202,7 @@ ExecuTorch powers on-device AI at scale across Meta's family of apps, VR/AR devi

 **LLMs:** [Llama 3.2/3.1/3](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), [LiquidAI LFM2](examples/models/lfm2/README.md)

-**Multimodal:** [Llava](examples/models/llava/README.md) (vision-language), [Voxtral](examples/models/voxtral/README.md) (audio-language)
+**Multimodal:** [Llava](examples/models/llava/README.md) (vision-language), [Voxtral](examples/models/voxtral/README.md) (audio-language), [Gemma](examples/models/gemma3) (vision-language)

 **Vision/Speech:** [MobileNetV2](https://github.com/meta-pytorch/executorch-examples/tree/main/mv2), [DeepLabV3](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3), [Whisper](https://github.com/meta-pytorch/executorch-examples/tree/main/whisper/android/WhisperApp)


backends/apple/metal/metal_backend.py

Lines changed: 33 additions & 6 deletions
@@ -29,7 +29,6 @@

 # exist fallback operators in et namespace;
 supported_fallback_kernels: Dict[str, Any] = {
-    "aoti_torch_mps_addmm_out": None,
     "aoti_torch_mps_convolution": None,
     "aoti_torch_mps_mm_out": None,
     "at::_ops::_scaled_dot_product_attention_math_for_mps::call": None,
@@ -108,34 +107,62 @@ def preprocess(
         options: dict[str, typing.Any] = {
             # Do not link against the full PyTorch/libtorch library
             "aot_inductor.link_libtorch": False,
-            # Package model constants and other generated files directly in the shared object (.so) file
-            "aot_inductor.package_constants_in_so": True,
+            # Separate weight constants from the .so file
+            "aot_inductor.package": True,
+            "aot_inductor.package_constants_in_so": False,
+            # Store weight constants on disk in a binary blob
+            "aot_inductor.package_constants_on_disk_format": "binary_blob",
             # Enable maximum automatic tuning for optimal performance
             "max_autotune": True,
             # "aot_inductor.debug_compile": True,
             # "aot_inductor.force_mmap_weights": False,
         }

         with collect_unsupported_fallback_kernels():
-            so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]
+            paths = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]
             if len(missing_fallback_kernels) > 0:
                 formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels))
                 raise RuntimeError(
                     f"Missing fallback kernels ({len(missing_fallback_kernels)} total):\n - {formatted_kernels}\n"
                     "Please add them to the AOTI backend."
                 )

+        # Extract the .so and .blob paths from the returned list
+        so_path = None
+        blob_path = None
+        for path in paths:
+            if path.endswith(".wrapper.so"):
+                so_path = path
+            elif path.endswith(".wrapper_weights.blob"):
+                blob_path = path
+
+        if so_path is None or blob_path is None:
+            raise RuntimeError(
+                f"Could not find required files in compiled paths, got {paths}"
+            )
+
         # pyre-ignorep[6]: Incompatible parameter type
         with open(so_path, "rb") as f:
             so_data = f.read()

         named_data_store = NamedDataStore()
         method_name = MetalBackend.method_name_from_compile_specs(compile_specs)
+
+        # Keep the so file in the NamedDataStore, so that it can be packaged into the .pte file.
+        named_data_store.add_named_data(method_name + "_so_blob", so_data, 1, None)
+
+        # Add weights blob to named data store
+        with open(blob_path, "rb") as f:
+            blob_data = f.read()
+
         named_data_store.add_named_data(
-            method_name + "_so_blob", so_data, 1, "aoti_metal_blob"
+            method_name + "_weights_blob", blob_data, 1, "aoti_metal_blob"
         )

-        # Clean up the generated so file; it has been packaged into the NamdeDataStore
+        # Clean up the weights blob file
+        os.remove(blob_path)
+
+        # Clean up the generated so file; it has been packaged into the NamedDataStore
         # pyre-ignorep[6]: Incompatible parameter type
         os.remove(so_path)

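The metal_backend.py change switches AOTInductor to package weight constants on disk: with aot_inductor.package_constants_on_disk_format set to "binary_blob", torch._inductor.aot_compile returns a list of output files, and the backend selects the wrapper shared object and the weights blob by filename suffix before storing each in the NamedDataStore. Below is a small standalone sketch of that suffix-based selection; the helper name split_aoti_outputs is illustrative and not part of the backend.

    # Illustrative sketch of the selection logic in MetalBackend.preprocess; the
    # ".wrapper.so" and ".wrapper_weights.blob" suffixes are taken from the diff above.
    from typing import List, Tuple


    def split_aoti_outputs(paths: List[str]) -> Tuple[str, str]:
        # Pick each artifact by its suffix; fail if either is missing.
        so_path = next((p for p in paths if p.endswith(".wrapper.so")), None)
        blob_path = next((p for p in paths if p.endswith(".wrapper_weights.blob")), None)
        if so_path is None or blob_path is None:
            raise RuntimeError(f"Could not find required files in compiled paths, got {paths}")
        return so_path, blob_path


    # Example:
    # split_aoti_outputs(["gemma.wrapper.so", "gemma.wrapper_weights.blob"])
    # -> ("gemma.wrapper.so", "gemma.wrapper_weights.blob")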