
Commit 9bb9b1f

Merge branch 'main' into llm-runner-msvc
2 parents: c68d12d + f81d768

File tree: 110 files changed (+4248 additions, −2156 deletions)

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-44d8d54e38c0258357d4e92e1fefe21e845947a3
+467660923a5a25e4718e1d6697b93ff1bab4e807

.ci/docker/requirements-ci.txt

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ sympy==1.12
 timm==0.6.13
 tomli==2.0.1
 torchsr==1.0.4
-transformers==4.47.1
+transformers==4.56.1
 zstd==1.5.5.1
 pandas>=2.2.2; python_version >= '3.10'
 pytest==7.2.0

.ci/scripts/test_phi_3_mini.sh

Lines changed: 11 additions & 12 deletions
@@ -36,34 +36,33 @@ cmake_build_phi_3_mini() {
   cmake --build ${BUILD_DIR}/${MODEL_DIR} -j${NPROC} --config ${BUILD_TYPE}
 }
 
-# Download and convert tokenizer.model
+# Download tokenizer.model
 prepare_tokenizer() {
-  echo "Downloading and converting tokenizer.model"
-  wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true"
-  $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
+  echo "Downloading tokenizer.model"
+  wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer.model?download=true"
 }
 
 # Export phi-3-mini model to pte
 export_phi_3_mini () {
   echo "Exporting phi-3-mini. This will take a few minutes"
-  $PYTHON_EXECUTABLE -m executorch.examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte
+  optimum-cli export executorch --model microsoft/Phi-3-mini-4k-instruct --task text-generation --recipe xnnpack --output_dir ./
 }
 
 run_and_verify() {
   NOW=$(date +"%H:%M:%S")
   echo "Starting to run phi-3-mini runner at ${NOW}"
-  if [[ ! -f "phi-3-mini.pte" ]]; then
-    echo "Export failed. Abort"
+  if [[ ! -f "model.pte" ]]; then
+    echo "Missing model artifact. Abort"
     exit 1
   fi
-  if [[ ! -f "tokenizer.bin" ]]; then
-    echo "tokenizer.bin is missing."
+  if [[ ! -f "tokenizer.model" ]]; then
+    echo "tokenizer.model is missing."
     exit 1
   fi
 
   ${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner \
-    --model_path=phi-3-mini.pte \
-    --tokenizer_path=tokenizer.bin \
+    --model_path=model.pte \
+    --tokenizer_path=tokenizer.model \
    --seq_len=60 \
    --temperature=0 \
    --prompt="<|system|>

@@ -92,7 +91,7 @@ What is the capital of France?<|end|>
 cmake_install_executorch_libraries
 cmake_build_phi_3_mini
 
-# Step 2. Export the tokenizer and model
+# Step 2. Export the model
 prepare_tokenizer
 export_phi_3_mini
 
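Net effect of this change: the llama2c tokenizer conversion (tokenizer.model to tokenizer.bin) is gone, export goes through optimum-cli, which writes model.pte, and the runner consumes tokenizer.model directly. A minimal sketch of the new flow for a local run, assuming optimum-executorch is installed and using ./phi_3_mini_runner as a hypothetical stand-in for the built ${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner binary:

  # Download the raw tokenizer; no tokenizer.bin conversion step anymore.
  wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer.model?download=true"

  # Export through optimum-cli; model.pte is written to --output_dir.
  optimum-cli export executorch \
    --model microsoft/Phi-3-mini-4k-instruct \
    --task text-generation \
    --recipe xnnpack \
    --output_dir ./

  # The runner takes tokenizer.model directly.
  ./phi_3_mini_runner \
    --model_path=model.pte \
    --tokenizer_path=tokenizer.model \
    --seq_len=60 \
    --temperature=0 \
    --prompt="What is the capital of France?"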

.github/workflows/cuda.yml

Lines changed: 30 additions & 8 deletions
@@ -88,14 +88,26 @@ jobs:
         PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
 
   export-voxtral-cuda-artifact:
-    name: export-voxtral-cuda-artifact
+    name: export-voxtral-cuda-${{ matrix.quant.name }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     secrets: inherit
     strategy:
       fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+            extra_args: ""
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+            extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
+            # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+            extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN

@@ -104,7 +116,7 @@ jobs:
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      upload-artifact: voxtral-cuda-export
+      upload-artifact: ${{ matrix.quant.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux

@@ -122,14 +134,16 @@ jobs:
         pip list
         echo "::endgroup::"
 
-        echo "::group::Export Voxtral"
+        echo "::group::Export Voxtral (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
         optimum-cli export executorch \
           --model "mistralai/Voxtral-Mini-3B-2507" \
           --task "multimodal-text-to-text" \
           --recipe "cuda" \
           --dtype bfloat16 \
           --device cuda \
           --max_seq_len 1024 \
+          ${EXTRA_ARGS} \
           --output_dir ./
         python -m executorch.extension.audio.mel_spectrogram \
           --feature_size 128 \

@@ -142,7 +156,7 @@ jobs:
         test -f voxtral_preprocessor.pte
         echo "::endgroup::"
 
-        echo "::group::Store Voxtral Artifacts"
+        echo "::group::Store Voxtral Artifacts (${{ matrix.quant.name }})"
         mkdir -p "${RUNNER_ARTIFACT_DIR}"
         cp model.pte "${RUNNER_ARTIFACT_DIR}/"
         cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"

@@ -201,22 +215,30 @@ jobs:
         echo "::endgroup::"
 
   test-voxtral-cuda-e2e:
-    name: test-voxtral-cuda-e2e
+    name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
     needs: export-voxtral-cuda-artifact
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     strategy:
       fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
     with:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      download-artifact: voxtral-cuda-export
+      download-artifact: ${{ matrix.format.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux

@@ -226,7 +248,7 @@ jobs:
         pip list
         echo "::endgroup::"
 
-        echo "::group::Prepare Voxtral Artifacts"
+        echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
         cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
         cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
         cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .

@@ -255,7 +277,7 @@ jobs:
         cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
         echo "::endgroup::"
 
-        echo "::group::Run Voxtral Runner"
+        echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
         set +e
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
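For reference, substituting the quantized-int4-tile-packed entry's extra_args into the export step above yields roughly the following command (a sketch assembled from matrix.quant.extra_args; the workflow builds it via the EXTRA_ARGS variable rather than spelling it out):

  optimum-cli export executorch \
    --model "mistralai/Voxtral-Mini-3B-2507" \
    --task "multimodal-text-to-text" \
    --recipe "cuda" \
    --dtype bfloat16 \
    --device cuda \
    --max_seq_len 1024 \
    --qlinear 4w \
    --qlinear_encoder 4w \
    --qlinear_packing_format tile_packed_to_4d \
    --qlinear_encoder_packing_format tile_packed_to_4d \
    --output_dir ./

Note the asymmetry in the third matrix entry: per the TODO above, quantizing the decoder with --qlinear 4w currently produces invalid results, so the int4-weight-only variant quantizes only the encoder.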

.github/workflows/metal.yml

Lines changed: 191 additions & 0 deletions
@@ -0,0 +1,191 @@
+name: Test Metal Backend
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - release/*
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: false
+
+jobs:
+  test-metal-builds:
+    name: test-executorch-metal-build
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Test ExecuTorch Metal build"
+        PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_executorch.sh
+        echo "::endgroup::"
+
+  export-voxtral-metal-artifact:
+    name: export-voxtral-metal-artifact
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      upload-artifact: voxtral-metal-export
+      script: |
+        set -eux
+
+        echo "::group::Setup Huggingface"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        echo "::endgroup::"
+
+        echo "::group::Setup Optimum-ExecuTorch"
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        echo "Using optimum-executorch version: ${OPTIMUM_ET_VERSION}"
+        ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        ${CONDA_RUN} pip install mistral-common librosa
+        echo "::endgroup::"
+
+        echo "::group::Setup ExecuTorch"
+        PYTHON_EXECUTABLE=python ${CONDA_RUN} ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Pip List"
+        ${CONDA_RUN} pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral"
+        ${CONDA_RUN} optimum-cli export executorch \
+          --model "mistralai/Voxtral-Mini-3B-2507" \
+          --task "multimodal-text-to-text" \
+          --recipe "metal" \
+          --dtype bfloat16 \
+          --max_seq_len 1024 \
+          --output_dir ./
+        ${CONDA_RUN} python -m executorch.extension.audio.mel_spectrogram \
+          --feature_size 128 \
+          --stack_output \
+          --max_audio_len 300 \
+          --output_file voxtral_preprocessor.pte
+
+        test -f model.pte
+        test -f aoti_metal_blob.ptd
+        test -f voxtral_preprocessor.pte
+        echo "::endgroup::"
+
+        echo "::group::Store Voxtral Artifacts"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_metal_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}"
+        echo "::endgroup::"
+
+  test-voxtral-metal-e2e:
+    name: test-voxtral-metal-e2e
+    needs: export-voxtral-metal-artifact
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m2-stable
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      download-artifact: voxtral-metal-export
+      script: |
+        set -eux
+
+        echo "::group::Print machine info"
+        uname -a
+        if [ $(uname -s) == Darwin ]; then
+          sw_vers
+          # Print RAM in GB
+          RAM_BYTES=$(sysctl -n hw.memsize)
+          RAM_GB=$(echo "scale=2; $RAM_BYTES/1024/1024/1024" | bc)
+          echo "Available RAM (GB): $RAM_GB"
+          sysctl machdep.cpu.brand_string
+          sysctl machdep.cpu.core_count
+          # Print number of GPU cores (Apple Silicon)
+          if command -v system_profiler &> /dev/null; then
+            GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Total Number of Cores/ {print $5; exit}')
+            if [ -z "$GPU_CORES" ]; then
+              # Fallback: try to parse "Core Count" from Apple GPU section
+              GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Core Count/ {print $3; exit}')
+            fi
+            echo "GPU Cores: ${GPU_CORES:-Unknown}"
+          else
+            echo "system_profiler not available, cannot determine GPU cores."
+          fi
+        fi
+        echo "::endgroup::"
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_requirements.sh
+        echo "::endgroup::"
+
+        echo "::group::Pip List"
+        ${CONDA_RUN} pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_metal_blob.ptd" .
+        cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
+        TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
+        curl -L $TOKENIZER_URL -o tekken.json
+        ls -al model.pte aoti_metal_blob.ptd voxtral_preprocessor.pte tekken.json
+        echo "::endgroup::"
+
+        echo "::group::Create Test Audio File"
+        say -o call_samantha_hall.aiff "Call Samantha Hall"
+        afconvert -f WAVE -d LEI16 call_samantha_hall.aiff call_samantha_hall.wav
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        ${CONDA_RUN} cmake --preset llm \
+          -DEXECUTORCH_BUILD_METAL=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        ${CONDA_RUN} cmake --build cmake-out -j$(( $(sysctl -n hw.ncpu) - 1 )) --target install --config Release
+
+        ${CONDA_RUN} cmake -DEXECUTORCH_BUILD_METAL=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/voxtral \
+          -Bcmake-out/examples/models/voxtral/
+        ${CONDA_RUN} cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Runner"
+        set +e
+        OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
+          --model_path model.pte \
+          --data_path aoti_metal_blob.ptd \
+          --tokenizer_path tekken.json \
+          --audio_path call_samantha_hall.wav \
+          --processor_path voxtral_preprocessor.pte \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "Samantha"; then
+          echo "Expected output 'Samantha' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"

.github/workflows/pull.yml

Lines changed: 5 additions & 2 deletions
@@ -632,11 +632,14 @@ jobs:
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
-
+        echo "::group::Setup ExecuTorch"
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
-
+        echo "::endgroup::"
+
+        echo "::group::Setup requirements"
         # install phi-3-mini requirements
         bash examples/models/phi-3-mini/install_requirements.sh
+        echo "::endgroup::"
 
         # run e2e (export, tokenizer and runner)
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh Release

.gitignore

Lines changed: 0 additions & 1 deletion
@@ -62,7 +62,6 @@ xcuserdata/
 /include/
 /share/
 /version.py
-*.csv
 *_etdump
 
 # Android
