150 changes: 130 additions & 20 deletions .github/workflows/cuda.yml
@@ -87,8 +87,8 @@ jobs:
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda

test-voxtral-cuda-e2e:
name: test-voxtral-cuda-e2e
export-voxtral-cuda-artifact:
name: export-voxtral-cuda-artifact
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
@@ -104,6 +104,7 @@ jobs:
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
upload-artifact: voxtral-cuda-export
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux
@@ -118,6 +119,7 @@
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
pip install mistral-common librosa
pip list
echo "::endgroup::"

echo "::group::Export Voxtral"
@@ -129,9 +131,58 @@
--device cuda \
--max_seq_len 1024 \
--output_dir ./
python -m executorch.extension.audio.mel_spectrogram \
--feature_size 128 \
--stack_output \
--max_audio_len 300 \
--output_file voxtral_preprocessor.pte

test -f model.pte
test -f aoti_cuda_blob.ptd
test -f voxtral_preprocessor.pte
echo "::endgroup::"

echo "::group::Build Voxtral Runner"
echo "::group::Store Voxtral Artifacts"
mkdir -p "${RUNNER_ARTIFACT_DIR}"
cp model.pte "${RUNNER_ARTIFACT_DIR}/"
cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
ls -al "${RUNNER_ARTIFACT_DIR}"
echo "::endgroup::"

benchmark-voxtral-cuda:
name: benchmark-voxtral-cuda
needs: export-voxtral-cuda-artifact
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
strategy:
fail-fast: false
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
download-artifact: voxtral-cuda-export
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux

echo "::group::Setup ExecuTorch Requirements"
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
pip list
echo "::endgroup::"

echo "::group::Prepare Voxtral Artifacts"
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
ls -al model.pte aoti_cuda_blob.ptd
echo "::endgroup::"

echo "::group::Build Voxtral Benchmark"
cmake -DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_CUDA=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
@@ -142,31 +193,90 @@ jobs:
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
echo "::endgroup::"

echo "::group::Run Voxtral Benchmark"

export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd

echo "::endgroup::"

test-voxtral-cuda-e2e:
name: test-voxtral-cuda-e2e
needs: export-voxtral-cuda-artifact
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
strategy:
fail-fast: false
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
download-artifact: voxtral-cuda-export
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux

echo "::group::Setup ExecuTorch Requirements"
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
pip list
echo "::endgroup::"

echo "::group::Prepare Voxtral Artifacts"
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
curl -L $TOKENIZER_URL -o tekken.json
ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json
echo "::endgroup::"

echo "::group::Download Test Audio File"
AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
curl -L $AUDIO_URL -o dancing.wav
echo "::endgroup::"

echo "::group::Build Voxtral Runner"
cmake --preset llm \
-DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Release \
-Bcmake-out -S.
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release

cmake -DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_BUILD_TYPE=Release \
-Sexamples/models/voxtral \
-Bcmake-out/examples/models/voxtral/
cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
echo "::endgroup::"

echo "::group::Run Voxtral Runner"
# Capture output and allow exit code 139 if we have the expected printout
set +e
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
OUTPUT=$(cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd 2>&1)
OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
--model_path model.pte \
--data_path aoti_cuda_blob.ptd \
--tokenizer_path tekken.json \
--audio_path dancing.wav \
--processor_path voxtral_preprocessor.pte \
--temperature 0 2>&1)
EXIT_CODE=$?
set -e

echo "$OUTPUT"

# Check if the output contains "Run latency (ms):"
if echo "$OUTPUT" | grep -q "Run latency (ms):"; then
echo "Found expected output: 'Run latency (ms):'"
if [ $EXIT_CODE -eq 139 ]; then
echo "Exit code 139 (segfault) detected, but passing since we have the expected output"
exit 0
elif [ $EXIT_CODE -ne 0 ]; then
echo "Unexpected exit code: $EXIT_CODE"
exit $EXIT_CODE
else
echo "Command succeeded with exit code 0"
exit 0
fi
else
echo "Expected output 'Run latency (ms):' not found in output"
if ! echo "$OUTPUT" | grep -iq "dancing"; then
echo "Expected output 'dancing' not found in output"
exit 1
fi

if [ $EXIT_CODE -ne 0 ]; then
echo "Unexpected exit code: $EXIT_CODE"
exit $EXIT_CODE
fi
echo "::endgroup::"
45 changes: 43 additions & 2 deletions backends/aoti/common_shims.cpp
@@ -51,13 +51,34 @@ AOTITorchError aoti_torch_get_storage_offset(

AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
auto it = internal::tensor_to_strides.find(tensor);
bool needs_update = false;

Contributor comment:

Can you make the docblock something like this?

// CRITICAL: Multimodal models reuse tensors with different shapes across
// executions (e.g., variable-length audio). We MUST validate cached metadata
// matches current tensor state, or CUDA kernels will receive incorrect shapes
// leading to memory corruption and segfaults.

if (it == internal::tensor_to_strides.end()) {
needs_update = true;
} else {
// Check if cached values are still valid
auto tensor_strides = tensor->strides();
if (it->second.size() != static_cast<size_t>(tensor->dim())) {
needs_update = true;
} else {
for (int i = 0; i < tensor->dim(); i++) {
if (it->second[i] != tensor_strides[i]) {
needs_update = true;
break;
}
}
}
}

if (needs_update) {
std::vector<int64_t> strides(tensor->dim());
auto tensor_strides = tensor->strides();
for (int i = 0; i < tensor->dim(); i++) {
strides[i] = tensor_strides[i];
}
it = internal::tensor_to_strides.emplace(tensor, std::move(strides)).first;
it =
internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides))
.first;
}

// For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -80,13 +101,33 @@ AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) {

AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
auto it = internal::tensor_to_sizes.find(tensor);
bool needs_update = false;

if (it == internal::tensor_to_sizes.end()) {
needs_update = true;
} else {
// Check if cached values are still valid
auto tensor_sizes = tensor->sizes();
if (it->second.size() != static_cast<size_t>(tensor->dim())) {
needs_update = true;
} else {
for (int i = 0; i < tensor->dim(); i++) {
if (it->second[i] != tensor_sizes[i]) {
needs_update = true;
break;
}
}
}
}

if (needs_update) {
std::vector<int64_t> sizes(tensor->dim());
auto tensor_sizes = tensor->sizes();
for (int i = 0; i < tensor->dim(); i++) {
sizes[i] = tensor_sizes[i];
}
it = internal::tensor_to_sizes.emplace(tensor, std::move(sizes)).first;
it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes))
.first;
}

// For 0D tensors, data() returns nullptr on empty vectors, but we need to
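To make the reviewer's point above concrete, here is a minimal, self-contained sketch of the validate-then-refresh caching pattern that this file now applies to both strides and sizes. The `FakeTensor` type, the `stride_cache` map, and `get_cached_strides` are simplified stand-ins for ExecuTorch's `Tensor` and the shim's internal cache, not the real API.

```cpp
// Minimal sketch (assumed types): cache per-tensor stride metadata, but
// re-validate it against the tensor's current state on every lookup, since
// multimodal models may reuse the same tensor object with a new shape.
#include <cstdint>
#include <unordered_map>
#include <vector>

struct FakeTensor {  // stand-in for ExecuTorch's Tensor
  std::vector<int64_t> shape;
  std::vector<int64_t> stride;
  int64_t dim() const { return static_cast<int64_t>(shape.size()); }
  const std::vector<int64_t>& strides() const { return stride; }
};

static std::unordered_map<const FakeTensor*, std::vector<int64_t>> stride_cache;

const int64_t* get_cached_strides(const FakeTensor& t) {
  auto it = stride_cache.find(&t);
  // Cache miss, rank change, or any stale element forces a refresh.
  bool needs_update = (it == stride_cache.end()) ||
      (it->second.size() != static_cast<size_t>(t.dim()));
  if (!needs_update) {
    for (int64_t i = 0; i < t.dim(); ++i) {
      if (it->second[i] != t.strides()[i]) {
        needs_update = true;
        break;
      }
    }
  }
  if (needs_update) {
    // insert_or_assign overwrites a stale entry instead of keeping it,
    // which is the behavioral change the diff makes over emplace().
    it = stride_cache.insert_or_assign(&t, t.strides()).first;
  }
  return it->second.data();  // note: the real shim special-cases 0-D tensors
}
```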
2 changes: 2 additions & 0 deletions backends/cuda/cuda_backend.py
@@ -135,6 +135,8 @@ def preprocess(
"aot_inductor.link_libtorch": False,
# Package model constants and other generated files directly in the shared object (.so) file
"aot_inductor.package_constants_in_so": True,
# Enable debug mode if the DEBUG environment variable is set
"aot_inductor.debug_compile": os.environ.get("DEBUG") == "1",
# Enable maximum automatic tuning for optimal performance
"max_autotune": True,
# Use TRITON for GEMM (General Matrix Multiply) operations tuning only to avoid using operators in libtorch
9 changes: 7 additions & 2 deletions backends/cuda/runtime/cuda_backend.cpp
@@ -165,6 +165,13 @@ class ET_EXPERIMENTAL CudaBackend final
Span<EValue*> args) const override {
AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;

// Need to re-register all the symbols from the so_handle hosted by this
// CudaBackend instance. The reason is that these symbols are
// static/singleton across the whole process. When we share multiple methods
// (meaning multiple so_handle) in the same process, we need to re-register
// the symbols from the so_handle that is being used in this execution.
register_shared_library_functions(handle->so_handle);

size_t n_inputs;
AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs);

@@ -223,7 +230,6 @@ class ET_EXPERIMENTAL CudaBackend final
"Failed to copy input %d from CPU to GPU",
i);
}
ET_LOG(Info, "Inputs copied to GPU");
// Process output tensors: create GPU counterparts for ExecuTorch CPU
// tensors
for (int i = 0; i < n_outputs; i++) {
@@ -253,7 +259,6 @@

gpu_outputs[i] = gpu_output_handle;
}
ET_LOG(Info, "Outputs created on GPU");
// Run AOTI container with GPU tensors
AOTIRuntimeError error = AOTInductorModelContainerRun(
handle->container_handle,
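As a rough illustration of why `register_shared_library_functions(handle->so_handle)` is now called at the top of `execute`, the hedged sketch below models the underlying problem: the AOTI entry points behave like process-wide singletons, so when several methods (several `.so` handles) live in one process, each execution must first re-bind the symbols from its own handle. The `dlsym` usage and the `aoti_run_fn` symbol name are illustrative assumptions, not the actual ExecuTorch registration code.

```cpp
// Sketch (assumptions): one process-wide function pointer shared by all
// backend instances, each of which dlopen()ed its own AOTI shared object.
#include <dlfcn.h>

using RunFn = int (*)(void* container, void** inputs, void** outputs);

// Global/singleton slot: whichever library registered last "wins".
static RunFn g_aoti_run = nullptr;

// Re-bind the slot from a specific shared-library handle.
static bool register_from(void* so_handle) {
  // "aoti_run_fn" is a made-up symbol name for illustration only.
  auto fn = reinterpret_cast<RunFn>(dlsym(so_handle, "aoti_run_fn"));
  if (fn == nullptr) {
    return false;
  }
  g_aoti_run = fn;
  return true;
}

// Called per execution: if two methods (two .so handles) coexist in the same
// process, each execute() must first point the global slot back at *its*
// library, otherwise it would call into the other method's code.
int execute(void* so_handle, void* container, void** ins, void** outs) {
  if (!register_from(so_handle)) {
    return -1;
  }
  return g_aoti_run(container, ins, outs);
}
```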
7 changes: 7 additions & 0 deletions examples/models/voxtral/CMakeLists.txt
@@ -86,6 +86,13 @@ list(
extension_flat_tensor
)

# Link CUDA backend
if(EXECUTORCH_BUILD_CUDA)
find_package(CUDAToolkit REQUIRED)
list(APPEND link_libraries aoti_cuda)
executorch_target_link_options_shared_lib(aoti_cuda)
endif()

# Add tokenizers
list(APPEND link_libraries tokenizers::tokenizers)

35 changes: 34 additions & 1 deletion extension/llm/runner/multimodal_prefiller.cpp
@@ -93,14 +93,47 @@ Result<uint64_t> MultimodalPrefiller::prefill(
} else if (input.is_audio()) {
Audio audio = input.get_audio();

// Use Audio::toTensor() for tensor creation
auto method_meta = ET_UNWRAP(
module_->method_meta(kAudioEncoderMethod),
"Failed to get method_meta for %s",
kAudioEncoderMethod);

ET_CHECK_OR_RETURN_ERROR(
method_meta.num_inputs() > 0,
InvalidArgument,
"Audio encoder should have at least 1 input");
auto input_meta = ET_UNWRAP(
method_meta.input_tensor_meta(0),
"Cannot get input tensor meta at index 0");
auto expected_dtype = input_meta.scalar_type();

// Create tensor with original dtype
auto audio_tensor =
ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");

// Convert to expected dtype if needed
if (audio_tensor->scalar_type() != expected_dtype) {
if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) {
// Convert to bfloat16
audio_tensor = ET_UNWRAP(
convert_to_bfloat16(audio_tensor),
"Failed to convert audio tensor to bfloat16");
} else {
ET_LOG(
Error,
"Unsupported audio encoder input dtype: %s. Expecting %s",
::executorch::runtime::toString(audio_tensor->scalar_type()),
::executorch::runtime::toString(expected_dtype));
return ::executorch::runtime::Error::NotSupported;
}
}

ET_LOG(
Info,
"Audio tensor dim: %zu, dtype: %s",
audio_tensor->dim(),
::executorch::runtime::toString(audio_tensor->scalar_type()));

// Run audio encoder
auto audio_encoder_result =
module_->execute(kAudioEncoderMethod, audio_tensor);
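The prefiller change above boils down to a small decision: ask the loaded audio-encoder method which input dtype it expects, and convert the audio tensor only when the dtypes differ. The stripped-down sketch below mirrors that control flow under stated assumptions; `SimpleTensor`, `DType`, and the `convert_to_bfloat16` helper are hypothetical stand-ins for ExecuTorch's tensor, `ScalarType`, and conversion utility.

```cpp
// Sketch (assumed types): match an input tensor's dtype to what the
// audio-encoder method expects, converting only when needed.
#include <memory>

enum class DType { Float32, BFloat16 };

struct SimpleTensor {  // stand-in for an ExecuTorch tensor
  DType dtype;
};

// Hypothetical conversion helper; the real code returns a Result<> and may fail.
std::shared_ptr<SimpleTensor> convert_to_bfloat16(
    const std::shared_ptr<SimpleTensor>& src) {
  auto out = std::make_shared<SimpleTensor>(*src);
  out->dtype = DType::BFloat16;  // element-wise cast elided in this sketch
  return out;
}

// Returns nullptr for unsupported target dtypes, mirroring the diff's
// NotSupported error path.
std::shared_ptr<SimpleTensor> match_encoder_dtype(
    std::shared_ptr<SimpleTensor> audio_tensor, DType expected) {
  if (audio_tensor->dtype == expected) {
    return audio_tensor;  // already what the encoder wants
  }
  if (expected == DType::BFloat16) {
    return convert_to_bfloat16(audio_tensor);
  }
  return nullptr;  // unsupported conversion target
}
```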