
Commit 55b19c3

Merge branch 'main' into export-D84284541
2 parents 600c480 + d00279d commit 55b19c3


58 files changed: +1551, -383 lines changed
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-53a2908a10f414a2f85caa06703a26a40e873869
+cf9d09490c7f6685ec68d5db3acf2e0d73c54d00

.github/workflows/cuda.yml

Lines changed: 130 additions & 20 deletions
@@ -87,8 +87,8 @@ jobs:
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
 
-  test-voxtral-cuda-e2e:
-    name: test-voxtral-cuda-e2e
+  export-voxtral-cuda-artifact:
+    name: export-voxtral-cuda-artifact
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -104,6 +104,7 @@ jobs:
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
+      upload-artifact: voxtral-cuda-export
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
         set -eux
@@ -118,6 +119,7 @@ jobs:
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
         pip install mistral-common librosa
+        pip list
         echo "::endgroup::"
 
         echo "::group::Export Voxtral"
@@ -129,9 +131,58 @@ jobs:
           --device cuda \
           --max_seq_len 1024 \
           --output_dir ./
+        python -m executorch.extension.audio.mel_spectrogram \
+          --feature_size 128 \
+          --stack_output \
+          --max_audio_len 300 \
+          --output_file voxtral_preprocessor.pte
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        test -f voxtral_preprocessor.pte
         echo "::endgroup::"
 
-        echo "::group::Build Voxtral Runner"
+        echo "::group::Store Voxtral Artifacts"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}"
+        echo "::endgroup::"
+
+  benchmark-voxtral-cuda:
+    name: benchmark-voxtral-cuda
+    needs: export-voxtral-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: voxtral-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Benchmark"
         cmake -DCMAKE_BUILD_TYPE=Release \
           -DEXECUTORCH_BUILD_CUDA=ON \
           -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
@@ -142,31 +193,90 @@ jobs:
         cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
         echo "::endgroup::"
 
+        echo "::group::Run Voxtral Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
+  test-voxtral-cuda-e2e:
+    name: test-voxtral-cuda-e2e
+    needs: export-voxtral-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: voxtral-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
+        TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
+        curl -L $TOKENIZER_URL -o tekken.json
+        ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json
+        echo "::endgroup::"
+
+        echo "::group::Download Test Audio File"
+        AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
+        curl -L $AUDIO_URL -o poem.wav
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        cmake --preset llm \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/voxtral \
+          -Bcmake-out/examples/models/voxtral/
+        cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
+        echo "::endgroup::"
+
         echo "::group::Run Voxtral Runner"
-        # Capture output and allow exit code 139 if we have the expected printout
         set +e
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        OUTPUT=$(cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd 2>&1)
+        OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
+          --model_path model.pte \
+          --data_path aoti_cuda_blob.ptd \
+          --tokenizer_path tekken.json \
+          --audio_path poem.wav \
+          --processor_path voxtral_preprocessor.pte \
+          --temperature 0 2>&1)
         EXIT_CODE=$?
         set -e
 
         echo "$OUTPUT"
 
-        # Check if the output contains "Run latency (ms):"
-        if echo "$OUTPUT" | grep -q "Run latency (ms):"; then
-          echo "Found expected output: 'Run latency (ms):'"
-          if [ $EXIT_CODE -eq 139 ]; then
-            echo "Exit code 139 (segfault) detected, but passing since we have the expected output"
-            exit 0
-          elif [ $EXIT_CODE -ne 0 ]; then
-            echo "Unexpected exit code: $EXIT_CODE"
-            exit $EXIT_CODE
-          else
-            echo "Command succeeded with exit code 0"
-            exit 0
-          fi
-        else
-          echo "Expected output 'Run latency (ms):' not found in output"
+        if ! echo "$OUTPUT" | grep -iq "poem"; then
+          echo "Expected output 'poem' not found in output"
           exit 1
         fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
         echo "::endgroup::"

backends/aoti/common_shims.cpp

Lines changed: 39 additions & 2 deletions
@@ -51,13 +51,32 @@ AOTITorchError aoti_torch_get_storage_offset(
 
 AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
   auto it = internal::tensor_to_strides.find(tensor);
+  bool needs_update = false;
+
   if (it == internal::tensor_to_strides.end()) {
+    needs_update = true;
+  } else {
+    // CRITICAL: Multimodal models reuse tensors with different shapes across
+    // executions (e.g., variable-length audio). We MUST validate cached
+    // metadata matches current tensor state, or CUDA kernels will receive
+    // incorrect shapes leading to memory corruption and segfaults.
+    auto tensor_strides = tensor->strides();
+    needs_update = !std::equal(
+        it->second.begin(),
+        it->second.end(),
+        tensor_strides.begin(),
+        tensor_strides.end());
+  }
+
+  if (needs_update) {
     std::vector<int64_t> strides(tensor->dim());
     auto tensor_strides = tensor->strides();
     for (int i = 0; i < tensor->dim(); i++) {
      strides[i] = tensor_strides[i];
     }
-    it = internal::tensor_to_strides.emplace(tensor, std::move(strides)).first;
+    it =
+        internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides))
+            .first;
   }
 
   // For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -80,13 +99,31 @@ AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) {
 
 AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
   auto it = internal::tensor_to_sizes.find(tensor);
+  bool needs_update = false;
+
   if (it == internal::tensor_to_sizes.end()) {
+    needs_update = true;
+  } else {
+    // CRITICAL: Multimodal models reuse tensors with different shapes across
+    // executions (e.g., variable-length audio). We MUST validate cached
+    // metadata matches current tensor state, or CUDA kernels will receive
+    // incorrect shapes leading to memory corruption and segfaults.
+    auto tensor_sizes = tensor->sizes();
+    needs_update = !std::equal(
+        it->second.begin(),
+        it->second.end(),
+        tensor_sizes.begin(),
+        tensor_sizes.end());
+  }
+
+  if (needs_update) {
     std::vector<int64_t> sizes(tensor->dim());
     auto tensor_sizes = tensor->sizes();
     for (int i = 0; i < tensor->dim(); i++) {
       sizes[i] = tensor_sizes[i];
     }
-    it = internal::tensor_to_sizes.emplace(tensor, std::move(sizes)).first;
+    it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes))
+             .first;
   }
 
   // For 0D tensors, data() returns nullptr on empty vectors, but we need to
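
The change above is the substance of the fix: aoti_torch_get_strides and aoti_torch_get_sizes no longer trust a cache hit blindly, and they switch from emplace (which leaves an existing entry untouched) to insert_or_assign (which overwrites it). As a standalone sketch of that caching pattern, not the ExecuTorch API itself (ShapeCache and cached_shape are hypothetical names used only for illustration):

#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

// Hypothetical stand-ins for the real tensor handle and metadata cache.
using ShapeCache = std::unordered_map<const void*, std::vector<int64_t>>;

// Returns a pointer to cached shape metadata for `key`, refreshing the entry
// whenever the caller's current shape no longer matches what was cached.
// emplace() would silently keep a stale entry for an existing key;
// insert_or_assign() overwrites it in place.
inline const int64_t* cached_shape(
    ShapeCache& cache,
    const void* key,
    const std::vector<int64_t>& current_shape) {
  auto it = cache.find(key);
  bool needs_update = it == cache.end() ||
      !std::equal(
          it->second.begin(),
          it->second.end(),
          current_shape.begin(),
          current_shape.end());
  if (needs_update) {
    it = cache.insert_or_assign(key, current_shape).first;
  }
  return it->second.data();
}

The std::equal comparison is what makes tensor reuse safe here: a cached entry is only trusted while it still matches the tensor's current shape, which is exactly the situation the CRITICAL comment describes for multimodal inputs such as variable-length audio.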

backends/arm/README.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ PyTorch models to a TOSA representation. This representation is used to
 deploy to the following targets:
 
 - **Arm&reg; Ethos&trade;-U55/65/85** - Compiled using the Ethos-U Vela compiler.
-- **VGF (Vulkan&reg; Graph Format)** – SPIR-V™ representation for Vulkan-capable devices.
+- **VGF Format, for ML extensions for Vulkan®** - a format containing SPIR-V™ ML operators for Vulkan-capable devices.
 
 The backend provides an ahead-of-time (AOT) flow, that produces a PTE file for your
 chosen target. The AOT flow supports the following development operating systems:

backends/arm/requirements-arm-ethos-u.txt

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-ethos-u-vela == 4.4.0
+ethos-u-vela == 4.4.1

backends/arm/scripts/mlsdk_utils.sh

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@ function setup_mlsdk() {
             -DSPIRV_TOOLS_PATH=../../dependencies/SPIRV-Tools \
             -DVULKAN_HEADERS_PATH=../../dependencies/Vulkan-Headers
 
-        cmake --build build
+        cmake --build build -j$(nproc)
         cmake --install build --prefix deploy
         popd
     fi

backends/arm/test/ops/test_add.py

Lines changed: 0 additions & 4 deletions
@@ -7,7 +7,6 @@
 
 from typing import cast, Tuple
 
-import pytest
 import torch
 from executorch.backends.arm.quantizer import arm_quantizer
 from executorch.backends.arm.quantizer.arm_quantizer import (
@@ -260,9 +259,6 @@ def get_symmetric_a16w8_add_quantizer(per_channel_quantization=False):
 
 
 @common.parametrize("test_data", Add.test_data)
-@pytest.mark.xfail(
-    reason="missing int16 add ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13730"
-)
 def test_add_tensor_16a8w_tosa_INT(test_data: input_t1):
     """Test add operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False

backends/arm/test/ops/test_cat.py

Lines changed: 0 additions & 10 deletions
@@ -8,7 +8,6 @@
 
 from typing import Tuple
 
-import pytest
 import torch
 from executorch.backends.arm.quantizer.arm_quantizer import (
     get_symmetric_a16w8_quantization_config,
@@ -178,9 +177,6 @@ def get_symmetric_a16w8_cat_quantizer(per_channel_quantization=False):
 
 
 @common.parametrize("test_data", Cat.test_parameters)
-@pytest.mark.xfail(
-    reason="missing int16 cat ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13978"
-)
 def test_cat_16a8w_tosa_INT(test_data: Tuple):
     """Test cat operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -206,9 +202,6 @@ def test_cat_16a8w_tosa_INT(test_data: Tuple):
 
 @common.parametrize("test_data", Cat.test_parameters)
 @common.XfailIfNoCorstone300
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 cat operations"
-)
 def test_cat_16a8w_u55_INT16(test_data: Tuple):
     """Test cat operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -233,9 +226,6 @@ def test_cat_16a8w_u55_INT16(test_data: Tuple):
 
 @common.parametrize("test_data", Cat.test_parameters)
 @common.XfailIfNoCorstone320
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 cat operations"
-)
 def test_cat_16a8w_u85_INT16(test_data: Tuple):
     """Test cat operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False

backends/arm/test/ops/test_mul.py

Lines changed: 0 additions & 4 deletions
@@ -8,7 +8,6 @@
 
 from typing import Tuple
 
-import pytest
 import torch
 from executorch.backends.arm.quantizer.arm_quantizer import (
     get_symmetric_a16w8_quantization_config,
@@ -310,9 +309,6 @@ def get_symmetric_a16w8_mul_quantizer(per_channel_quantization=False):
 
 
 @common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(
-    reason="missing int16 mul ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13947"
-)
 def test_mul_tensor_16a8w_tosa_INT(test_data: input_t1):
     """Test mul operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False

backends/arm/test/ops/test_multihead_attention.py

Lines changed: 0 additions & 3 deletions
@@ -3,7 +3,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import pytest
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -69,7 +68,6 @@ def test_multihead_attention_tosa_INT(test_data):
     "test_data",
     test_suite,
 )
-@pytest.mark.xfail(reason="MLETORCH-1102: Numerical issues on FVP")
 @common.XfailIfNoCorstone300
 def test_multihead_attention_u55_INT(test_data: input_t1):
     test_data, module = test_data()
@@ -90,7 +88,6 @@ def test_multihead_attention_u55_INT(test_data: input_t1):
     "test_data",
     test_suite,
 )
-@pytest.mark.xfail(reason="MLETORCH-1102: Numerical issues on FVP")
 @common.XfailIfNoCorstone320
 def test_multihead_attention_u85_INT(test_data: input_t1):
     test_data, module = test_data()
