Commit e05b531

Merge branch 'main' into op-floor-div

2 parents 0abfba1 + ef21739

168 files changed: +9901 -2417 lines changed
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-44d8d54e38c0258357d4e92e1fefe21e845947a3
+e8f76b4295584c4328e7fd7971c131cb341c7438

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-53a2908a10f414a2f85caa06703a26a40e873869
+e6f766c7d750d40603eee3f66c5915bac606b3ea

.ci/docker/requirements-ci.txt

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ sympy==1.12
 timm==0.6.13
 tomli==2.0.1
 torchsr==1.0.4
-transformers==4.47.1
+transformers==4.56.1
 zstd==1.5.5.1
 pandas>=2.2.2; python_version >= '3.10'
 pytest==7.2.0
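The transformers pin moves from 4.47.1 to 4.56.1. A quick local sanity check of the new pin (a minimal sketch; assumes a plain pip environment):

    pip install transformers==4.56.1
    python -c "import transformers; print(transformers.__version__)"  # expect 4.56.1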

.ci/scripts/test_phi_3_mini.sh

Lines changed: 11 additions & 12 deletions

@@ -36,34 +36,33 @@ cmake_build_phi_3_mini() {
   cmake --build ${BUILD_DIR}/${MODEL_DIR} -j${NPROC} --config ${BUILD_TYPE}
 }
 
-# Download and convert tokenizer.model
+# Download tokenizer.model
 prepare_tokenizer() {
-  echo "Downloading and converting tokenizer.model"
-  wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true"
-  $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
+  echo "Downloading tokenizer.model"
+  wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer.model?download=true"
 }
 
 # Export phi-3-mini model to pte
 export_phi_3_mini () {
   echo "Exporting phi-3-mini. This will take a few minutes"
-  $PYTHON_EXECUTABLE -m executorch.examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte
+  optimum-cli export executorch --model microsoft/Phi-3-mini-4k-instruct --task text-generation --recipe xnnpack --output_dir ./
 }
 
 run_and_verify() {
   NOW=$(date +"%H:%M:%S")
   echo "Starting to run phi-3-mini runner at ${NOW}"
-  if [[ ! -f "phi-3-mini.pte" ]]; then
-    echo "Export failed. Abort"
+  if [[ ! -f "model.pte" ]]; then
+    echo "Missing model artifact. Abort"
     exit 1
   fi
-  if [[ ! -f "tokenizer.bin" ]]; then
-    echo "tokenizer.bin is missing."
+  if [[ ! -f "tokenizer.model" ]]; then
+    echo "tokenizer.model is missing."
     exit 1
   fi
 
   ${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner \
-    --model_path=phi-3-mini.pte \
-    --tokenizer_path=tokenizer.bin \
+    --model_path=model.pte \
+    --tokenizer_path=tokenizer.model \
     --seq_len=60 \
     --temperature=0 \
     --prompt="<|system|>

@@ -92,7 +91,7 @@ What is the capital of France?<|end|>
 cmake_install_executorch_libraries
 cmake_build_phi_3_mini
 
-# Step 2. Export the tokenizer and model
+# Step 2. Export the model
 prepare_tokenizer
 export_phi_3_mini
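The updated flow can be reproduced outside CI along these lines (a sketch; assumes optimum-executorch is installed and the runner binary is already built; commands are taken from the diff above):

    # Download the tokenizer directly; no tokenizer.bin conversion step is needed anymore.
    wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer.model?download=true"
    # Export with optimum-cli; this writes model.pte into the current directory.
    optimum-cli export executorch --model microsoft/Phi-3-mini-4k-instruct --task text-generation --recipe xnnpack --output_dir ./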

.ci/scripts/utils.sh

Lines changed: 39 additions & 0 deletions

@@ -44,6 +44,44 @@ install_pip_dependencies() {
   popd || return
 }
 
+dedupe_macos_loader_path_rpaths() {
+  if [[ "$(uname)" != "Darwin" ]]; then
+    return
+  fi
+
+  local torch_lib_dir
+  pushd ..
+  torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib
+  popd
+
+  if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then
+    return
+  fi
+
+  local torch_libs=(
+    "libtorch_cpu.dylib"
+    "libtorch.dylib"
+    "libc10.dylib"
+  )
+
+  for lib_name in "${torch_libs[@]}"; do
+    local lib_path="${torch_lib_dir}/${lib_name}"
+    if [[ ! -f "${lib_path}" ]]; then
+      continue
+    fi
+
+    local removed=0
+    # Repeatedly remove the @loader_path rpath entries until none remain.
+    while install_name_tool -delete_rpath @loader_path "${lib_path}" 2>/dev/null; do
+      removed=1
+    done
+
+    if [[ "${removed}" == "1" ]]; then
+      install_name_tool -add_rpath @loader_path "${lib_path}" || true
+    fi
+  done
+}
+
 install_domains() {
   echo "Install torchvision and torchaudio"
   pip install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${TORCHAUDIO_VERSION}"

@@ -101,6 +139,7 @@ install_pytorch_and_domains() {
     echo "Use cached wheel at ${cached_torch_wheel}"
   fi
 
+  dedupe_macos_loader_path_rpaths
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
   export TORCHAUDIO_VERSION
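What the helper corrects can be inspected by hand (a sketch; assumes macOS with Xcode command-line tools, and the <site-packages> path is a placeholder for the actual install location):

    # Show LC_RPATH load commands; duplicate @loader_path entries trigger the linker error.
    otool -l "<site-packages>/torch/lib/libtorch_cpu.dylib" | grep -A2 LC_RPATH
    # Manual equivalent of the helper: drop every @loader_path entry, then re-add one.
    install_name_tool -delete_rpath @loader_path "<site-packages>/torch/lib/libtorch_cpu.dylib"
    install_name_tool -add_rpath @loader_path "<site-packages>/torch/lib/libtorch_cpu.dylib"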

.github/workflows/cuda.yml

Lines changed: 30 additions & 8 deletions

@@ -88,14 +88,26 @@ jobs:
       PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
 
   export-voxtral-cuda-artifact:
-    name: export-voxtral-cuda-artifact
+    name: export-voxtral-cuda-${{ matrix.quant.name }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
      id-token: write
      contents: read
     secrets: inherit
     strategy:
       fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+            extra_args: ""
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+            extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
+            # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+            extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN

@@ -104,7 +116,7 @@ jobs:
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      upload-artifact: voxtral-cuda-export
+      upload-artifact: ${{ matrix.quant.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux

@@ -122,14 +134,16 @@ jobs:
         pip list
         echo "::endgroup::"
 
-        echo "::group::Export Voxtral"
+        echo "::group::Export Voxtral (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
         optimum-cli export executorch \
           --model "mistralai/Voxtral-Mini-3B-2507" \
           --task "multimodal-text-to-text" \
           --recipe "cuda" \
           --dtype bfloat16 \
           --device cuda \
           --max_seq_len 1024 \
+          ${EXTRA_ARGS} \
           --output_dir ./
         python -m executorch.extension.audio.mel_spectrogram \
           --feature_size 128 \

@@ -142,7 +156,7 @@ jobs:
         test -f voxtral_preprocessor.pte
         echo "::endgroup::"
 
-        echo "::group::Store Voxtral Artifacts"
+        echo "::group::Store Voxtral Artifacts (${{ matrix.quant.name }})"
         mkdir -p "${RUNNER_ARTIFACT_DIR}"
         cp model.pte "${RUNNER_ARTIFACT_DIR}/"
         cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"

@@ -201,22 +215,30 @@ jobs:
         echo "::endgroup::"
 
   test-voxtral-cuda-e2e:
-    name: test-voxtral-cuda-e2e
+    name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
     needs: export-voxtral-cuda-artifact
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     strategy:
       fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
     with:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      download-artifact: voxtral-cuda-export
+      download-artifact: ${{ matrix.format.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux

@@ -226,7 +248,7 @@ jobs:
         pip list
         echo "::endgroup::"
 
-        echo "::group::Prepare Voxtral Artifacts"
+        echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
         cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
         cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
         cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .

@@ -255,7 +277,7 @@ jobs:
         cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
         echo "::endgroup::"
 
-        echo "::group::Run Voxtral Runner"
+        echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
         set +e
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
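Outside CI, the quantized variants correspond to optimum-cli invocations like the following (a sketch assembled from the matrix above; the flags are taken verbatim from the tile-packed entry, and a CUDA machine with optimum-executorch installed is assumed):

    optimum-cli export executorch \
      --model "mistralai/Voxtral-Mini-3B-2507" \
      --task "multimodal-text-to-text" \
      --recipe "cuda" \
      --dtype bfloat16 \
      --device cuda \
      --max_seq_len 1024 \
      --qlinear 4w --qlinear_encoder 4w \
      --qlinear_packing_format tile_packed_to_4d \
      --qlinear_encoder_packing_format tile_packed_to_4d \
      --output_dir ./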

.github/workflows/pull.yml

Lines changed: 6 additions & 2 deletions

@@ -351,6 +351,7 @@ jobs:
 
         # reinstall executorch
         bash ./install_executorch.sh --minimal
+        pip list
 
         # run python unittest
         python -m unittest examples.models.moshi.mimi.test_mimi

@@ -631,11 +632,14 @@ jobs:
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
-
+        echo "::group::Setup ExecuTorch"
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
-
+        echo "::endgroup::"
+
+        echo "::group::Setup requirements"
         # install phi-3-mini requirements
         bash examples/models/phi-3-mini/install_requirements.sh
+        echo "::endgroup::"
 
         # run e2e (export, tokenizer and runner)
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh Release
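The ::group::/::endgroup:: pairs added here are GitHub Actions workflow commands: everything echoed between them folds into one collapsible section in the job log. The pattern is generic (a minimal sketch; my_command is a hypothetical step):

    echo "::group::My section"
    my_command            # output collapses under "My section" in the log viewer
    echo "::endgroup::"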

.github/workflows/trunk.yml

Lines changed: 1 addition & 0 deletions

@@ -290,6 +290,7 @@ jobs:
           - test_arm_baremetal: test_models_ethos-u85
           - test_arm_baremetal: test_smaller_stories_llama
           - test_arm_baremetal: test_memory_allocation
+          - test_arm_baremetal: test_model_smollm2-135M
       fail-fast: false
     with:
       runner: linux.2xlarge.memory

CMakeLists.txt

Lines changed: 16 additions & 29 deletions

@@ -99,28 +99,6 @@ announce_configured_options(CCACHE_PROGRAM)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-# Setup RPATH. See
-# https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling
-# Use separate rpaths during build and install phases
-set(CMAKE_SKIP_BUILD_RPATH OFF)
-# Don't use the install-rpath during the build phase
-set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
-# Automatically add all linked folders that are NOT in the build directory to
-# the rpath (per library?)
-#
-# TODO: Doesn't work for us right now because we are not installing .so's into
-# the correct locations. For example we have libcustom_ops_aot_lib.so depending
-# on _portable_lib.so, which was eventually put under
-# <site-packages>/executorch/extension/pybindings/ but this rpath is not
-# automatically added because at build time it seems `portable_lib` is being
-# built under the same directory, so no extra rpath is being added. To properly
-# fix this we need to install `portable_lib` into the correct path.
-set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
-# ------------------------------ OPTIONS -------------------------------------
-# WARNING: Please don't add example specific options in this CMakeLists.txt.
-# Instead please use `find_package(executorch REQUIRED)` in the example
-# directory and add a new executable in the example `CMakeLists.txt`.
-
 if(NOT EXECUTORCH_ENABLE_LOGGING)
   # Avoid pulling in the logging strings, which can be large. Note that this
   # will set the compiler flag for all targets in this directory, and for all

@@ -605,15 +583,23 @@ if(EXECUTORCH_BUILD_CORTEX_M)
   list(APPEND _executorch_backends coretex_m_backend)
 endif()
 
-if(EXECUTORCH_BUILD_CUDA)
-  # Build common AOTI functionality (required for CUDA)
+# Build common AOTI functionality if needed by CUDA or Metal backends
+if(EXECUTORCH_BUILD_CUDA OR EXECUTORCH_BUILD_METAL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
+endif()
+
+if(EXECUTORCH_BUILD_CUDA)
   # Build CUDA-specific AOTI functionality
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda)
   # Add aoti_cuda to backends - it already depends on aoti_common
   list(APPEND _executorch_backends aoti_cuda)
 endif()
 
+if(EXECUTORCH_BUILD_METAL)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/metal)
+  list(APPEND _executorch_backends metal_backend)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple)
 endif()

@@ -901,12 +887,13 @@ if(EXECUTORCH_BUILD_PYBIND)
 
   # Set RPATH to find PyTorch libraries relative to the installation location
   # This goes from executorch/extension/pybindings up to site-packages, then to
-  # torch/lib
+  # torch/lib. Don't do this on APPLE, as it fails with the error described
+  # below.
   if(APPLE)
-    set_target_properties(
-      portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib"
-                              INSTALL_RPATH "@loader_path/../../../torch/lib"
-    )
+    # Skip setting @loader_path for APPLE, since it causes an error like: ld:
+    # duplicate LC_RPATH '@loader_path' in
+    # '<site-packages>/torch/lib/libtorch_cpu.dylib'
   else()
     set_target_properties(
       portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib"
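For local experimentation, the new Metal path can be exercised with a configure along these lines (a sketch; EXECUTORCH_BUILD_METAL is the option wired up above, while the build directory name and other defaults are illustrative; assumes a macOS host):

    cmake -S . -B cmake-out -DEXECUTORCH_BUILD_METAL=ON
    cmake --build cmake-out -j"$(sysctl -n hw.ncpu)"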

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions

@@ -24,8 +24,8 @@ For Apple, please refer to the [iOS documentation](docs/source/using-executorch-
 executorch
 ├── <a href="backends">backends</a> - Backend delegate implementations for various hardware targets. Each backend uses partitioner to split the graph into subgraphs that can be executed on specific hardware, quantizer to optimize model precision, and runtime components to execute the graph on target hardware. For details refer to the <a href="docs/source/backend-delegates-integration.md">backend documentation</a> and the <a href="docs/source/using-executorch-export.md">Export and Lowering tutorial</a> for more information.
 │   ├── <a href="backends/apple">apple</a> - Apple-specific backends.
-│   │   ├── <a href="backends/apple/coreml">coreml</a> - CoreML backend for Apple devices. See <a href="docs/source/backends-coreml.md">doc</a>.
-│   │   └── <a href="backends/apple/mps">mps</a> - Metal Performance Shaders backend for Apple devices. See <a href="docs/source/backends-mps.md">doc</a>.
+│   │   ├── <a href="backends/apple/coreml">coreml</a> - CoreML backend for Apple devices. See <a href="docs/source/backends/coreml/coreml-overview.md">doc</a>.
+│   │   └── <a href="backends/apple/mps">mps</a> - Metal Performance Shaders backend for Apple devices. See <a href="docs/source/backends/mps/mps-overview.md">doc</a>.
 │   ├── <a href="backends/arm">arm</a> - ARM architecture backends. See <a href="docs/source/backends-arm-ethos-u.md">doc</a>.
 │   ├── <a href="backends/cadence">cadence</a> - Cadence-specific backends. See <a href="docs/source/backends-cadence.md">doc</a>.
 │   ├── <a href="backends/example">example</a> - Example backend implementations.
