Commit 6b1d672

Merge branch 'main' into tosa_dialect_conv2d

2 parents: 4fd50ff + f32cdc3

197 files changed: 9,084 additions, 1,054 deletions

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-53a2908a10f414a2f85caa06703a26a40e873869
+cf9d09490c7f6685ec68d5db3acf2e0d73c54d00

.ci/scripts/setup-samsung-linux-deps.sh

Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@ download_ai_lite_core() {
 API_BASE="https://soc-developer.semiconductor.samsung.com/api/v1/resource/ai-litecore/download"
 API_KEY=$SAMSUNG_AI_LITECORE_KEY

-VERSION="0.5"
+VERSION="0.7"
 OS_NAME="Ubuntu 22.04"
 OUT_FILE="/tmp/exynos-ai-litecore-v${VERSION}.tar.gz"
 TARGET_PATH="/tmp/exynos_ai_lite_core"
@@ -62,7 +62,7 @@ install_enn_backend() {
 export PYTHONPATH=${PYTHONPATH:-}:${EXECUTORCH_ROOT}/..
 }

-AI_LITE_CORE_VERSION=0.5.0
+AI_LITE_CORE_VERSION=0.7.0

 download_ai_lite_core ${AI_LITE_CORE_VERSION}
 install_enn_backend

.ci/scripts/test_ios_ci.sh

Lines changed: 1 addition & 0 deletions

@@ -36,6 +36,7 @@ say() {

 say "Cloning the Demo App"

+git config --global http.postBuffer 524288000
 git clone --depth 1 https://github.com/meta-pytorch/executorch-examples.git

 say "Installing CoreML Backend Requirements"

.ci/scripts/test_llava.sh

Lines changed: 1 addition & 0 deletions

@@ -38,6 +38,7 @@ EXECUTORCH_COMMON_CMAKE_ARGS=" \
 -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
 -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+-DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
 -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
 -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
 -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \

.github/workflows/cuda.yml

Lines changed: 130 additions & 20 deletions

@@ -87,8 +87,8 @@ jobs:
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda

-  test-voxtral-cuda-e2e:
-    name: test-voxtral-cuda-e2e
+  export-voxtral-cuda-artifact:
+    name: export-voxtral-cuda-artifact
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -104,6 +104,7 @@ jobs:
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
+      upload-artifact: voxtral-cuda-export
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
@@ -118,6 +119,7 @@ jobs:
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
         pip install mistral-common librosa
+        pip list
         echo "::endgroup::"

         echo "::group::Export Voxtral"
@@ -129,9 +131,58 @@ jobs:
           --device cuda \
           --max_seq_len 1024 \
           --output_dir ./
+        python -m executorch.extension.audio.mel_spectrogram \
+          --feature_size 128 \
+          --stack_output \
+          --max_audio_len 300 \
+          --output_file voxtral_preprocessor.pte
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        test -f voxtral_preprocessor.pte
         echo "::endgroup::"

-        echo "::group::Build Voxtral Runner"
+        echo "::group::Store Voxtral Artifacts"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}"
+        echo "::endgroup::"
+
+  benchmark-voxtral-cuda:
+    name: benchmark-voxtral-cuda
+    needs: export-voxtral-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: voxtral-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Benchmark"
         cmake -DCMAKE_BUILD_TYPE=Release \
           -DEXECUTORCH_BUILD_CUDA=ON \
           -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
@@ -142,31 +193,90 @@ jobs:
         cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
         echo "::endgroup::"

+        echo "::group::Run Voxtral Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
+  test-voxtral-cuda-e2e:
+    name: test-voxtral-cuda-e2e
+    needs: export-voxtral-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: voxtral-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
+        TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
+        curl -L $TOKENIZER_URL -o tekken.json
+        ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json
+        echo "::endgroup::"
+
+        echo "::group::Download Test Audio File"
+        AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
+        curl -L $AUDIO_URL -o poem.wav
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        cmake --preset llm \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/voxtral \
+          -Bcmake-out/examples/models/voxtral/
+        cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
+        echo "::endgroup::"
+
         echo "::group::Run Voxtral Runner"
-        # Capture output and allow exit code 139 if we have the expected printout
         set +e
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        OUTPUT=$(cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd 2>&1)
+        OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
+          --model_path model.pte \
+          --data_path aoti_cuda_blob.ptd \
+          --tokenizer_path tekken.json \
+          --audio_path poem.wav \
+          --processor_path voxtral_preprocessor.pte \
+          --temperature 0 2>&1)
         EXIT_CODE=$?
         set -e

         echo "$OUTPUT"

-        # Check if the output contains "Run latency (ms):"
-        if echo "$OUTPUT" | grep -q "Run latency (ms):"; then
-          echo "Found expected output: 'Run latency (ms):'"
-          if [ $EXIT_CODE -eq 139 ]; then
-            echo "Exit code 139 (segfault) detected, but passing since we have the expected output"
-            exit 0
-          elif [ $EXIT_CODE -ne 0 ]; then
-            echo "Unexpected exit code: $EXIT_CODE"
-            exit $EXIT_CODE
-          else
-            echo "Command succeeded with exit code 0"
-            exit 0
-          fi
-        else
-          echo "Expected output 'Run latency (ms):' not found in output"
+        if ! echo "$OUTPUT" | grep -iq "poem"; then
+          echo "Expected output 'poem' not found in output"
           exit 1
         fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
         echo "::endgroup::"

.github/workflows/pull.yml

Lines changed: 6 additions & 0 deletions

@@ -935,6 +935,12 @@ jobs:
           python -m executorch.examples.samsung.aot_compiler --model_name=$model -c E9955
         done

+        # Test quant models
+        model_scripts="deeplab_v3 edsr inception_v3 inception_v4 mobilenet_v2 mobilenet_v3 resnet18 resnet50 vit wav2letter"
+        for m_script in $model_scripts; do
+          python -m executorch.examples.samsung.scripts.${m_script} -c e9955 -p A8W8
+        done
+
         # Test ops
         python -m unittest discover -s backends/samsung/test/ops -p "test_*.py"

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion

@@ -346,7 +346,7 @@ jobs:
         elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then
           setup_script_args="--target-toolchain zephyr"
           toolchain_prefix=arm-zephyr-eabi-
-          threshold="135168" # 132 KiB
+          threshold="135240" # 132 KiB
           toolchain_cmake=examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
         else
           echo "Fail unsupport OS selection ${{ matrix.os }}"

backends/aoti/common_shims.cpp

Lines changed: 39 additions & 2 deletions

@@ -51,13 +51,32 @@ AOTITorchError aoti_torch_get_storage_offset(

 AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
   auto it = internal::tensor_to_strides.find(tensor);
+  bool needs_update = false;
+
   if (it == internal::tensor_to_strides.end()) {
+    needs_update = true;
+  } else {
+    // CRITICAL: Multimodal models reuse tensors with different shapes across
+    // executions (e.g., variable-length audio). We MUST validate cached
+    // metadata matches current tensor state, or CUDA kernels will receive
+    // incorrect shapes leading to memory corruption and segfaults.
+    auto tensor_strides = tensor->strides();
+    needs_update = !std::equal(
+        it->second.begin(),
+        it->second.end(),
+        tensor_strides.begin(),
+        tensor_strides.end());
+  }
+
+  if (needs_update) {
     std::vector<int64_t> strides(tensor->dim());
     auto tensor_strides = tensor->strides();
     for (int i = 0; i < tensor->dim(); i++) {
       strides[i] = tensor_strides[i];
     }
-    it = internal::tensor_to_strides.emplace(tensor, std::move(strides)).first;
+    it =
+        internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides))
+            .first;
   }

   // For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -80,13 +99,31 @@ AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) {

 AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
   auto it = internal::tensor_to_sizes.find(tensor);
+  bool needs_update = false;
+
   if (it == internal::tensor_to_sizes.end()) {
+    needs_update = true;
+  } else {
+    // CRITICAL: Multimodal models reuse tensors with different shapes across
+    // executions (e.g., variable-length audio). We MUST validate cached
+    // metadata matches current tensor state, or CUDA kernels will receive
+    // incorrect shapes leading to memory corruption and segfaults.
+    auto tensor_sizes = tensor->sizes();
+    needs_update = !std::equal(
+        it->second.begin(),
+        it->second.end(),
+        tensor_sizes.begin(),
+        tensor_sizes.end());
+  }
+
+  if (needs_update) {
     std::vector<int64_t> sizes(tensor->dim());
     auto tensor_sizes = tensor->sizes();
     for (int i = 0; i < tensor->dim(); i++) {
       sizes[i] = tensor_sizes[i];
     }
-    it = internal::tensor_to_sizes.emplace(tensor, std::move(sizes)).first;
+    it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes))
+             .first;
   }

   // For 0D tensors, data() returns nullptr on empty vectors, but we need to
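
The functional change above is the switch from emplace to insert_or_assign: std::unordered_map::emplace does nothing when the key is already present, so a tensor reused with a new shape kept serving stale cached strides/sizes, while insert_or_assign overwrites the mapped value. A minimal standalone sketch of the difference (hypothetical int keys standing in for Tensor pointers; illustration only, not code from this commit):

// Sketch: why emplace left stale cache entries and insert_or_assign does not.
#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <vector>

int main() {
  std::unordered_map<int, std::vector<int64_t>> cache;

  // First execution caches strides {8000, 1} under key 42.
  cache.emplace(42, std::vector<int64_t>{8000, 1});

  // The tensor is reused with a new shape; emplace silently keeps the old
  // value when the key already exists.
  cache.emplace(42, std::vector<int64_t>{16000, 1});
  assert(cache.at(42).front() == 8000); // stale entry survives

  // insert_or_assign overwrites the mapped value, as in the fixed code.
  cache.insert_or_assign(42, std::vector<int64_t>{16000, 1});
  assert(cache.at(42).front() == 16000); // cache matches the current shape

  return 0;
}

The needs_update guard keeps the common case cheap: std::equal compares the cached vector against the tensor's current metadata, and the map entry is rewritten only on a mismatch.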

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@
 from .convert_to_clamp import ConvertToClampPass # noqa
 from .decompose_acosh_pass import DecomposeAcoshPass # noqa
 from .decompose_adaptive_avg_pool2d_pass import DecomposeAdaptiveAvgPool2dPass # noqa
+from .decompose_add_sub_alpha_pass import DecomposeAddSubAlphaPass # noqa
 from .decompose_addmm_pass import DecomposeAddmmPass # noqa
 from .decompose_asin_and_acos_pass import DecomposeAsinAndAcosPass # noqa
 from .decompose_asinh_pass import DecomposeAsinhPass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions

@@ -35,6 +35,7 @@
     DecomposeAcoshPass,
     DecomposeAdaptiveAvgPool2dPass,
     DecomposeAddmmPass,
+    DecomposeAddSubAlphaPass,
     DecomposeAsinAndAcosPass,
     DecomposeAsinhPass,
     DecomposeAtanhPass,
@@ -262,6 +263,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         )
         self.add_pass(DecomposeNotEqualPass())
         self.add_pass(DecomposeDivPass())
+        self.add_pass(DecomposeAddSubAlphaPass())
         self.add_pass(DecomposeSoftmaxPass())
         self.add_pass(DecomposeGeluPass())
         self.add_pass(ConvertFullLikeToFullPass())
@@ -334,6 +336,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeSignPass())
         self.add_pass(DecomposeAddmmPass())
         self.add_pass(DecomposeDivTensorModePass())
+        self.add_pass(DecomposeAddSubAlphaPass())
         self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
         self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeGroupNormPass())
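
A note on the new pass: in PyTorch, add and sub accept an optional alpha argument, computing x + alpha * y and x - alpha * y respectively. The name DecomposeAddSubAlphaPass suggests it rewrites non-unit-alpha adds and subs into an explicit multiply followed by a plain add or sub ahead of TOSA lowering, since TOSA's ADD/SUB carry no alpha operand; the pass body itself (decompose_add_sub_alpha_pass.py) is not shown in this diff.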
