Skip to content

Commit 71a079d

Browse files
Update
[ghstack-poisoned]
2 parents 5dfcd4f + bcd7655 commit 71a079d

File tree

95 files changed

+1793
-667
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

95 files changed

+1793
-667
lines changed
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
53a2908a10f414a2f85caa06703a26a40e873869
1+
cf9d09490c7f6685ec68d5db3acf2e0d73c54d00

.ci/scripts/test_llava.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ EXECUTORCH_COMMON_CMAKE_ARGS=" \
3838
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
3939
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
4040
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
41+
-DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
4142
-DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
4243
-DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
4344
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \

.github/workflows/cuda.yml

Lines changed: 130 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -87,8 +87,8 @@ jobs:
8787
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
8888
PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
8989
90-
test-voxtral-cuda-e2e:
91-
name: test-voxtral-cuda-e2e
90+
export-voxtral-cuda-artifact:
91+
name: export-voxtral-cuda-artifact
9292
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
9393
permissions:
9494
id-token: write
@@ -104,6 +104,7 @@ jobs:
104104
gpu-arch-version: 12.6
105105
use-custom-docker-registry: false
106106
submodules: recursive
107+
upload-artifact: voxtral-cuda-export
107108
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
108109
script: |
109110
set -eux
@@ -118,6 +119,7 @@ jobs:
118119
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
119120
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
120121
pip install mistral-common librosa
122+
pip list
121123
echo "::endgroup::"
122124
123125
echo "::group::Export Voxtral"
@@ -129,9 +131,58 @@ jobs:
129131
--device cuda \
130132
--max_seq_len 1024 \
131133
--output_dir ./
134+
python -m executorch.extension.audio.mel_spectrogram \
135+
--feature_size 128 \
136+
--stack_output \
137+
--max_audio_len 300 \
138+
--output_file voxtral_preprocessor.pte
139+
140+
test -f model.pte
141+
test -f aoti_cuda_blob.ptd
142+
test -f voxtral_preprocessor.pte
132143
echo "::endgroup::"
133144
134-
echo "::group::Build Voxtral Runner"
145+
echo "::group::Store Voxtral Artifacts"
146+
mkdir -p "${RUNNER_ARTIFACT_DIR}"
147+
cp model.pte "${RUNNER_ARTIFACT_DIR}/"
148+
cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
149+
cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
150+
ls -al "${RUNNER_ARTIFACT_DIR}"
151+
echo "::endgroup::"
152+
153+
benchmark-voxtral-cuda:
154+
name: benchmark-voxtral-cuda
155+
needs: export-voxtral-cuda-artifact
156+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
157+
permissions:
158+
id-token: write
159+
contents: read
160+
strategy:
161+
fail-fast: false
162+
with:
163+
timeout: 90
164+
runner: linux.g5.4xlarge.nvidia.gpu
165+
gpu-arch-type: cuda
166+
gpu-arch-version: 12.6
167+
use-custom-docker-registry: false
168+
submodules: recursive
169+
download-artifact: voxtral-cuda-export
170+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
171+
script: |
172+
set -eux
173+
174+
echo "::group::Setup ExecuTorch Requirements"
175+
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
176+
pip list
177+
echo "::endgroup::"
178+
179+
echo "::group::Prepare Voxtral Artifacts"
180+
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
181+
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
182+
ls -al model.pte aoti_cuda_blob.ptd
183+
echo "::endgroup::"
184+
185+
echo "::group::Build Voxtral Benchmark"
135186
cmake -DCMAKE_BUILD_TYPE=Release \
136187
-DEXECUTORCH_BUILD_CUDA=ON \
137188
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
@@ -142,31 +193,90 @@ jobs:
142193
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
143194
echo "::endgroup::"
144195
196+
echo "::group::Run Voxtral Benchmark"
197+
198+
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
199+
cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
200+
201+
echo "::endgroup::"
202+
203+
# End-to-end Voxtral test on CUDA.
# Consumes the model.pte / aoti_cuda_blob.ptd / voxtral_preprocessor.pte files
# published by the export-voxtral-cuda-artifact job, builds the example
# voxtral_runner, transcribes a sample WAV file and checks the output text.
test-voxtral-cuda-e2e:
  name: test-voxtral-cuda-e2e
  needs: export-voxtral-cuda-artifact
  uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
  permissions:
    id-token: write
    contents: read
  strategy:
    fail-fast: false
  with:
    timeout: 90
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: 12.6
    use-custom-docker-registry: false
    submodules: recursive
    download-artifact: voxtral-cuda-export
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    script: |
      set -eux

      echo "::group::Setup ExecuTorch Requirements"
      CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
      pip list
      echo "::endgroup::"

      echo "::group::Prepare Voxtral Artifacts"
      cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
      cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
      cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
      TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
      # --fail: abort on an HTTP error instead of saving the error page as
      # tekken.json and failing later with a confusing tokenizer parse error.
      curl -fL "${TOKENIZER_URL}" -o tekken.json
      ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json
      echo "::endgroup::"

      echo "::group::Download Test Audio File"
      AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
      # Same reasoning as above: fail fast on a bad download.
      curl -fL "${AUDIO_URL}" -o poem.wav
      echo "::endgroup::"

      echo "::group::Build Voxtral Runner"
      cmake --preset llm \
            -DEXECUTORCH_BUILD_CUDA=ON \
            -DCMAKE_INSTALL_PREFIX=cmake-out \
            -DCMAKE_BUILD_TYPE=Release \
            -Bcmake-out -S.
      cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release

      cmake -DEXECUTORCH_BUILD_CUDA=ON \
            -DCMAKE_BUILD_TYPE=Release \
            -Sexamples/models/voxtral \
            -Bcmake-out/examples/models/voxtral/
      cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
      echo "::endgroup::"

      echo "::group::Run Voxtral Runner"
      # Tolerate an unset LD_LIBRARY_PATH under `set -u`, and avoid a trailing
      # ':' (which would add the current directory to the search path).
      export LD_LIBRARY_PATH="/opt/conda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

      # Capture output and exit status ourselves so the log is always printed
      # before the job decides whether to fail.
      set +e
      OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
            --model_path model.pte \
            --data_path aoti_cuda_blob.ptd \
            --tokenizer_path tekken.json \
            --audio_path poem.wav \
            --processor_path voxtral_preprocessor.pte \
            --temperature 0 2>&1)
      EXIT_CODE=$?
      set -e

      echo "$OUTPUT"

      # Report a runner crash with its real exit code first; otherwise a
      # crash would be masked by the generic "not found" failure below.
      if [ "${EXIT_CODE}" -ne 0 ]; then
        echo "Unexpected exit code: $EXIT_CODE"
        exit "${EXIT_CODE}"
      fi

      if ! echo "$OUTPUT" | grep -iq "poem"; then
        echo "Expected output 'poem' not found in output"
        exit 1
      fi
      echo "::endgroup::"

backends/aoti/aoti_model_container.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,8 @@ AOTInductorModelContainerGetNumOutputsFunc
2525
AOTInductorModelContainerGetNumOutputs = nullptr;
2626
AOTInductorModelContainerRunFunc AOTInductorModelContainerRun = nullptr;
2727

28-
// Global function pointers needed by Metal backend
28+
// Additional global function pointers for AOT Inductor model container
29+
// operations needed by Metal backend
2930
AOTInductorModelContainerGetInputNameFunc
3031
AOTInductorModelContainerGetInputName = nullptr;
3132
AOTInductorModelContainerGetNumConstantsFunc

backends/aoti/aoti_model_container.h

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -70,17 +70,21 @@ extern AOTInductorModelContainerGetNumOutputsFunc
7070
AOTInductorModelContainerGetNumOutputs;
7171
extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun;
7272

73-
// Function pointer types needed by Metal backend
73+
// Function pointer type for the AOTI model container entry point that
// retrieves the name of an input tensor by index.
74+
// Needed by Metal backend.
//
// Parameters:
//   container_handle - handle of the AOTI model container to query.
//   input_idx        - index of the input whose name is requested.
//   input_name       - presumably an out-parameter that receives the name
//                      string; ownership/lifetime of the string is not
//                      visible here (TODO confirm against the AOTI runtime).
// Returns an AOTIRuntimeError status code.
7475
using AOTInductorModelContainerGetInputNameFunc = AOTIRuntimeError (*)(
7576
AOTInductorModelContainerHandle container_handle,
7677
size_t input_idx,
7778
const char** input_name);
7879

80+
// Function pointer type for the AOTI model container entry point that
// retrieves the number of constants held by the container.
81+
// Needed by Metal backend.
//
// Parameters:
//   container_handle - handle of the AOTI model container to query.
//   num_constants    - presumably an out-parameter that receives the count
//                      (TODO confirm against the AOTI runtime).
// Returns an AOTIRuntimeError status code.
7982
using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)(
8083
AOTInductorModelContainerHandle container_handle,
8184
size_t* num_constants);
8285

83-
// Global function pointers needed by Metal backend
86+
// Global function pointers (will be loaded dynamically).
87+
// Needed by Metal backend
8488
extern AOTInductorModelContainerGetInputNameFunc
8589
AOTInductorModelContainerGetInputName;
8690
extern AOTInductorModelContainerGetNumConstantsFunc

backends/aoti/common_shims.cpp

Lines changed: 45 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -51,13 +51,32 @@ AOTITorchError aoti_torch_get_storage_offset(
5151

5252
AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
5353
auto it = internal::tensor_to_strides.find(tensor);
54+
bool needs_update = false;
55+
5456
if (it == internal::tensor_to_strides.end()) {
57+
needs_update = true;
58+
} else {
59+
// CRITICAL: Multimodal models reuse tensors with different shapes across
60+
// executions (e.g., variable-length audio). We MUST validate cached
61+
// metadata matches current tensor state, or CUDA kernels will receive
62+
// incorrect shapes leading to memory corruption and segfaults.
63+
auto tensor_strides = tensor->strides();
64+
needs_update = !std::equal(
65+
it->second.begin(),
66+
it->second.end(),
67+
tensor_strides.begin(),
68+
tensor_strides.end());
69+
}
70+
71+
if (needs_update) {
5572
std::vector<int64_t> strides(tensor->dim());
5673
auto tensor_strides = tensor->strides();
5774
for (int i = 0; i < tensor->dim(); i++) {
5875
strides[i] = tensor_strides[i];
5976
}
60-
it = internal::tensor_to_strides.emplace(tensor, std::move(strides)).first;
77+
it =
78+
internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides))
79+
.first;
6180
}
6281

6382
// For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -80,13 +99,31 @@ AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) {
8099

81100
AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
82101
auto it = internal::tensor_to_sizes.find(tensor);
102+
bool needs_update = false;
103+
83104
if (it == internal::tensor_to_sizes.end()) {
105+
needs_update = true;
106+
} else {
107+
// CRITICAL: Multimodal models reuse tensors with different shapes across
108+
// executions (e.g., variable-length audio). We MUST validate cached
109+
// metadata matches current tensor state, or CUDA kernels will receive
110+
// incorrect shapes leading to memory corruption and segfaults.
111+
auto tensor_sizes = tensor->sizes();
112+
needs_update = !std::equal(
113+
it->second.begin(),
114+
it->second.end(),
115+
tensor_sizes.begin(),
116+
tensor_sizes.end());
117+
}
118+
119+
if (needs_update) {
84120
std::vector<int64_t> sizes(tensor->dim());
85121
auto tensor_sizes = tensor->sizes();
86122
for (int i = 0; i < tensor->dim(); i++) {
87123
sizes[i] = tensor_sizes[i];
88124
}
89-
it = internal::tensor_to_sizes.emplace(tensor, std::move(sizes)).first;
125+
it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes))
126+
.first;
90127
}
91128

92129
// For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -139,17 +176,18 @@ int32_t aoti_torch_dtype_int64() {
139176
return 4; // PyTorch's int64 dtype code
140177
}
141178

179+
// Dtype utility function needed by Metal backend.
180+
// Returns the size of the dtype in bytes.
181+
size_t aoti_torch_dtype_element_size(int32_t dtype) {
182+
return dtype_to_element_size(dtype);
183+
}
184+
142185
// Cleanup functions
143186
void cleanup_tensor_metadata() {
144187
internal::tensor_to_sizes.clear();
145188
internal::tensor_to_strides.clear();
146189
}
147190

148-
// Needed by Metal backend
149-
size_t aoti_torch_dtype_element_size(int32_t dtype) {
150-
return dtype_to_element_size(dtype);
151-
}
152-
153191
} // extern "C"
154192

155193
} // namespace aoti

backends/aoti/common_shims.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -61,16 +61,16 @@ int32_t aoti_torch_dtype_float32();
6161
int32_t aoti_torch_dtype_bfloat16();
6262
int32_t aoti_torch_dtype_int64();
6363

64+
// Dtype utility function needed by Metal backend
65+
size_t aoti_torch_dtype_element_size(int32_t dtype);
66+
6467
// Autograd mode functions
6568
int32_t aoti_torch_grad_mode_is_enabled();
6669
void aoti_torch_grad_mode_set_enabled(bool enabled);
6770

6871
// Cleanup functions for clearing global state
6972
void cleanup_tensor_metadata();
7073

71-
// Needed by Metal backend
72-
size_t aoti_torch_dtype_element_size(int32_t dtype);
73-
7474
} // extern "C"
7575

7676
} // namespace aoti

0 commit comments

Comments
 (0)