150 changes: 130 additions & 20 deletions .github/workflows/cuda.yml
@@ -87,8 +87,8 @@ jobs:
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda

test-voxtral-cuda-e2e:
name: test-voxtral-cuda-e2e
export-voxtral-cuda-artifact:
name: export-voxtral-cuda-artifact
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
@@ -104,6 +104,7 @@ jobs:
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
upload-artifact: voxtral-cuda-export
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux
@@ -118,6 +119,7 @@
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
pip install mistral-common librosa
pip list
echo "::endgroup::"

echo "::group::Export Voxtral"
@@ -129,9 +131,58 @@
--device cuda \
--max_seq_len 1024 \
--output_dir ./
python -m executorch.extension.audio.mel_spectrogram \
--feature_size 128 \
--stack_output \
--max_audio_len 300 \
--output_file voxtral_preprocessor.pte

test -f model.pte
test -f aoti_cuda_blob.ptd
test -f voxtral_preprocessor.pte
echo "::endgroup::"

echo "::group::Build Voxtral Runner"
echo "::group::Store Voxtral Artifacts"
mkdir -p "${RUNNER_ARTIFACT_DIR}"
cp model.pte "${RUNNER_ARTIFACT_DIR}/"
cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
ls -al "${RUNNER_ARTIFACT_DIR}"
echo "::endgroup::"

benchmark-voxtral-cuda:
name: benchmark-voxtral-cuda
needs: export-voxtral-cuda-artifact
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
strategy:
fail-fast: false
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
download-artifact: voxtral-cuda-export
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux

echo "::group::Setup ExecuTorch Requirements"
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
pip list
echo "::endgroup::"

echo "::group::Prepare Voxtral Artifacts"
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
ls -al model.pte aoti_cuda_blob.ptd
echo "::endgroup::"

echo "::group::Build Voxtral Benchmark"
cmake -DCMAKE_BUILD_TYPE=Release \
-DEXECUTORCH_BUILD_CUDA=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
@@ -142,31 +193,90 @@ jobs:
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
echo "::endgroup::"

echo "::group::Run Voxtral Benchmark"

export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd

echo "::endgroup::"

test-voxtral-cuda-e2e:
name: test-voxtral-cuda-e2e
needs: export-voxtral-cuda-artifact
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
strategy:
fail-fast: false
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
use-custom-docker-registry: false
submodules: recursive
download-artifact: voxtral-cuda-export
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
set -eux

echo "::group::Setup ExecuTorch Requirements"
CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
pip list
echo "::endgroup::"

echo "::group::Prepare Voxtral Artifacts"
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
curl -L $TOKENIZER_URL -o tekken.json
ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json
echo "::endgroup::"

echo "::group::Download Test Audio File"
AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
curl -L $AUDIO_URL -o dancing.wav
echo "::endgroup::"

echo "::group::Build Voxtral Runner"
cmake --preset llm \
-DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Release \
-Bcmake-out -S.
cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release

cmake -DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_BUILD_TYPE=Release \
-Sexamples/models/voxtral \
-Bcmake-out/examples/models/voxtral/
cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
echo "::endgroup::"

echo "::group::Run Voxtral Runner"
# Capture output and allow exit code 139 if we have the expected printout
set +e
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
OUTPUT=$(cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd 2>&1)
OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
--model_path model.pte \
--data_path aoti_cuda_blob.ptd \
--tokenizer_path tekken.json \
--audio_path dancing.wav \
--processor_path voxtral_preprocessor.pte \
--temperature 0 2>&1)
EXIT_CODE=$?
set -e

echo "$OUTPUT"

# Check if the output contains "Run latency (ms):"
if echo "$OUTPUT" | grep -q "Run latency (ms):"; then
echo "Found expected output: 'Run latency (ms):'"
if [ $EXIT_CODE -eq 139 ]; then
echo "Exit code 139 (segfault) detected, but passing since we have the expected output"
exit 0
elif [ $EXIT_CODE -ne 0 ]; then
echo "Unexpected exit code: $EXIT_CODE"
exit $EXIT_CODE
else
echo "Command succeeded with exit code 0"
exit 0
fi
else
echo "Expected output 'Run latency (ms):' not found in output"
if ! echo "$OUTPUT" | grep -iq "dancing"; then
echo "Expected output 'dancing' not found in output"
exit 1
fi

if [ $EXIT_CODE -ne 0 ]; then
echo "Unexpected exit code: $EXIT_CODE"
exit $EXIT_CODE
fi
echo "::endgroup::"
45 changes: 43 additions & 2 deletions backends/aoti/common_shims.cpp
@@ -51,13 +51,34 @@ AOTITorchError aoti_torch_get_storage_offset(

AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
auto it = internal::tensor_to_strides.find(tensor);
bool needs_update = false;

Contributor comment:

Can you make the docblock something like this?

// CRITICAL: Multimodal models reuse tensors with different shapes across
// executions (e.g., variable-length audio). We MUST validate cached metadata
// matches current tensor state, or CUDA kernels will receive incorrect shapes
// leading to memory corruption and segfaults.

if (it == internal::tensor_to_strides.end()) {
needs_update = true;
} else {
// Check if cached values are still valid
auto tensor_strides = tensor->strides();
if (it->second.size() != static_cast<size_t>(tensor->dim())) {
needs_update = true;
} else {
for (int i = 0; i < tensor->dim(); i++) {
if (it->second[i] != tensor_strides[i]) {
needs_update = true;
break;
}
}
}
}

if (needs_update) {
std::vector<int64_t> strides(tensor->dim());
auto tensor_strides = tensor->strides();
for (int i = 0; i < tensor->dim(); i++) {
strides[i] = tensor_strides[i];
}
it = internal::tensor_to_strides.emplace(tensor, std::move(strides)).first;
it =
internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides))
.first;
}

// For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -80,13 +101,33 @@ AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) {

AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
auto it = internal::tensor_to_sizes.find(tensor);
bool needs_update = false;

if (it == internal::tensor_to_sizes.end()) {
needs_update = true;
} else {
// Check if cached values are still valid
auto tensor_sizes = tensor->sizes();
if (it->second.size() != static_cast<size_t>(tensor->dim())) {
needs_update = true;
} else {
for (int i = 0; i < tensor->dim(); i++) {
if (it->second[i] != tensor_sizes[i]) {
needs_update = true;
break;
}
}
}
}

if (needs_update) {
std::vector<int64_t> sizes(tensor->dim());
auto tensor_sizes = tensor->sizes();
for (int i = 0; i < tensor->dim(); i++) {
sizes[i] = tensor_sizes[i];
}
it = internal::tensor_to_sizes.emplace(tensor, std::move(sizes)).first;
it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes))
.first;
}

// For 0D tensors, data() returns nullptr on empty vectors, but we need to
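To make the reviewer's point above concrete, here is a minimal, self-contained sketch of the validate-then-refresh caching pattern that this file now applies to both strides and sizes. The `FakeTensor` type, the `stride_cache` map, and `get_cached_strides` are simplified stand-ins for ExecuTorch's `Tensor` and the shim's internal cache, not the real API.

```cpp
// Minimal sketch (assumed types): cache per-tensor stride metadata, but
// re-validate it against the tensor's current state on every lookup, since
// multimodal models may reuse the same tensor object with a new shape.
#include <cstdint>
#include <unordered_map>
#include <vector>

struct FakeTensor {  // stand-in for ExecuTorch's Tensor
  std::vector<int64_t> shape;
  std::vector<int64_t> stride;
  int64_t dim() const { return static_cast<int64_t>(shape.size()); }
  const std::vector<int64_t>& strides() const { return stride; }
};

static std::unordered_map<const FakeTensor*, std::vector<int64_t>> stride_cache;

const int64_t* get_cached_strides(const FakeTensor& t) {
  auto it = stride_cache.find(&t);
  // Cache miss, rank change, or any stale element forces a refresh.
  bool needs_update = (it == stride_cache.end()) ||
      (it->second.size() != static_cast<size_t>(t.dim()));
  if (!needs_update) {
    for (int64_t i = 0; i < t.dim(); ++i) {
      if (it->second[i] != t.strides()[i]) {
        needs_update = true;
        break;
      }
    }
  }
  if (needs_update) {
    // insert_or_assign overwrites a stale entry instead of keeping it,
    // which is the behavioral change the diff makes over emplace().
    it = stride_cache.insert_or_assign(&t, t.strides()).first;
  }
  return it->second.data();  // note: the real shim special-cases 0-D tensors
}
```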
2 changes: 2 additions & 0 deletions backends/cuda/cuda_backend.py
@@ -135,6 +135,8 @@ def preprocess(
"aot_inductor.link_libtorch": False,
# Package model constants and other generated files directly in the shared object (.so) file
"aot_inductor.package_constants_in_so": True,
# Enable debug mode if the DEBUG environment variable is set
"aot_inductor.debug_compile": os.environ.get("DEBUG") == "1",
# Enable maximum automatic tuning for optimal performance
"max_autotune": True,
# Use TRITON for GEMM (General Matrix Multiply) operations tuning only to avoid using operators in libtorch
9 changes: 7 additions & 2 deletions backends/cuda/runtime/cuda_backend.cpp
@@ -165,6 +165,13 @@ class ET_EXPERIMENTAL CudaBackend final
Span<EValue*> args) const override {
AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;

// Need to re-register all the symbols from the so_handle hosted by this
// CudaBackend instance. The reason is that these symbols are
// static/singleton across the whole process. When we share multiple methods
// (meaning multiple so_handle) in the same process, we need to re-register
// the symbols from the so_handle that is being used in this execution.
register_shared_library_functions(handle->so_handle);

size_t n_inputs;
AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs);

@@ -223,7 +230,6 @@ class ET_EXPERIMENTAL CudaBackend final
"Failed to copy input %d from CPU to GPU",
i);
}
ET_LOG(Info, "Inputs copied to GPU");
// Process output tensors: create GPU counterparts for ExecuTorch CPU
// tensors
for (int i = 0; i < n_outputs; i++) {
@@ -253,7 +259,6 @@

gpu_outputs[i] = gpu_output_handle;
}
ET_LOG(Info, "Outputs created on GPU");
// Run AOTI container with GPU tensors
AOTIRuntimeError error = AOTInductorModelContainerRun(
handle->container_handle,
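As a rough illustration of why `register_shared_library_functions(handle->so_handle)` is now called at the top of `execute`, the hedged sketch below models the underlying problem: the AOTI entry points behave like process-wide singletons, so when several methods (several `.so` handles) live in one process, each execution must first re-bind the symbols from its own handle. The `dlsym` usage and the `aoti_run_fn` symbol name are illustrative assumptions, not the actual ExecuTorch registration code.

```cpp
// Sketch (assumptions): one process-wide function pointer shared by all
// backend instances, each of which dlopen()ed its own AOTI shared object.
#include <dlfcn.h>

using RunFn = int (*)(void* container, void** inputs, void** outputs);

// Global/singleton slot: whichever library registered last "wins".
static RunFn g_aoti_run = nullptr;

// Re-bind the slot from a specific shared-library handle.
static bool register_from(void* so_handle) {
  // "aoti_run_fn" is a made-up symbol name for illustration only.
  auto fn = reinterpret_cast<RunFn>(dlsym(so_handle, "aoti_run_fn"));
  if (fn == nullptr) {
    return false;
  }
  g_aoti_run = fn;
  return true;
}

// Called per execution: if two methods (two .so handles) coexist in the same
// process, each execute() must first point the global slot back at *its*
// library, otherwise it would call into the other method's code.
int execute(void* so_handle, void* container, void** ins, void** outs) {
  if (!register_from(so_handle)) {
    return -1;
  }
  return g_aoti_run(container, ins, outs);
}
```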
7 changes: 7 additions & 0 deletions examples/models/voxtral/CMakeLists.txt
@@ -86,6 +86,13 @@ list(
extension_flat_tensor
)

# Link CUDA backend
if(EXECUTORCH_BUILD_CUDA)
find_package(CUDAToolkit REQUIRED)
list(APPEND link_libraries aoti_cuda)
executorch_target_link_options_shared_lib(aoti_cuda)
endif()

# Add tokenizers
list(APPEND link_libraries tokenizers::tokenizers)

35 changes: 34 additions & 1 deletion extension/llm/runner/multimodal_prefiller.cpp
@@ -93,14 +93,47 @@ Result<uint64_t> MultimodalPrefiller::prefill(
} else if (input.is_audio()) {
Audio audio = input.get_audio();

// Use Audio::toTensor() for tensor creation
auto method_meta = ET_UNWRAP(
module_->method_meta(kAudioEncoderMethod),
"Failed to get method_meta for %s",
kAudioEncoderMethod);

ET_CHECK_OR_RETURN_ERROR(
method_meta.num_inputs() > 0,
InvalidArgument,
"Audio encoder should have at least 1 input");
auto input_meta = ET_UNWRAP(
method_meta.input_tensor_meta(0),
"Cannot get input tensor meta at index 0");
auto expected_dtype = input_meta.scalar_type();

// Create tensor with original dtype
auto audio_tensor =
ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");

// Convert to expected dtype if needed
if (audio_tensor->scalar_type() != expected_dtype) {
if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) {
// Convert to bfloat16
audio_tensor = ET_UNWRAP(
convert_to_bfloat16(audio_tensor),
"Failed to convert audio tensor to bfloat16");
} else {
ET_LOG(
Error,
"Unsupported audio encoder input dtype: %s. Expecting %s",
::executorch::runtime::toString(audio_tensor->scalar_type()),
::executorch::runtime::toString(expected_dtype));
return ::executorch::runtime::Error::NotSupported;
}
}

ET_LOG(
Info,
"Audio tensor dim: %zu, dtype: %s",
audio_tensor->dim(),
::executorch::runtime::toString(audio_tensor->scalar_type()));

// Run audio encoder
auto audio_encoder_result =
module_->execute(kAudioEncoderMethod, audio_tensor);
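The prefiller change above boils down to a small decision: ask the loaded audio-encoder method which input dtype it expects, and convert the audio tensor only when the dtypes differ. The stripped-down sketch below mirrors that control flow under stated assumptions; `SimpleTensor`, `DType`, and the `convert_to_bfloat16` helper are hypothetical stand-ins for ExecuTorch's tensor, `ScalarType`, and conversion utility.

```cpp
// Sketch (assumed types): match an input tensor's dtype to what the
// audio-encoder method expects, converting only when needed.
#include <memory>

enum class DType { Float32, BFloat16 };

struct SimpleTensor {  // stand-in for an ExecuTorch tensor
  DType dtype;
};

// Hypothetical conversion helper; the real code returns a Result<> and may fail.
std::shared_ptr<SimpleTensor> convert_to_bfloat16(
    const std::shared_ptr<SimpleTensor>& src) {
  auto out = std::make_shared<SimpleTensor>(*src);
  out->dtype = DType::BFloat16;  // element-wise cast elided in this sketch
  return out;
}

// Returns nullptr for unsupported target dtypes, mirroring the diff's
// NotSupported error path.
std::shared_ptr<SimpleTensor> match_encoder_dtype(
    std::shared_ptr<SimpleTensor> audio_tensor, DType expected) {
  if (audio_tensor->dtype == expected) {
    return audio_tensor;  // already what the encoder wants
  }
  if (expected == DType::BFloat16) {
    return convert_to_bfloat16(audio_tensor);
  }
  return nullptr;  // unsupported conversion target
}
```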