pytorch · desertfire · Oct 18, 2025 · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025
diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt
@@ -1 +1 @@
-44d8d54e38c0258357d4e92e1fefe21e845947a3
+09fdbd0a0639b128f712a4f5202ed42ca4c60957
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
@@ -200,23 +200,166 @@ jobs:
 
         echo "::endgroup::"
 
+  export-voxtral-cuda-quantized-int4-tile-packed:
+    name: export-voxtral-cuda-quantized-int4-tile-packed
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: voxtral-cuda-quantized-int4-tile-packed
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip install mistral-common librosa
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral with Quantization (int4-tile-packed)"
+        optimum-cli export executorch \
+            --model "mistralai/Voxtral-Mini-3B-2507" \
+            --task "multimodal-text-to-text" \
+            --recipe "cuda" \
+            --dtype bfloat16 \
+            --device cuda \
+            --max_seq_len 1024 \
+            --qlinear 4w \
+            --qlinear_encoder 4w \
+            --qlinear_packing_format tile_packed_to_4d \
+            --qlinear_encoder_packing_format tile_packed_to_4d \
+            --output_dir ./
+        python -m executorch.extension.audio.mel_spectrogram \
+            --feature_size 128 \
+            --stack_output \
+            --max_audio_len 300 \
+            --output_file voxtral_preprocessor.pte
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        test -f voxtral_preprocessor.pte
+        echo "::endgroup::"
+
+        echo "::group::Store Voxtral Quantized Artifacts (int4-tile-packed)"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}"
+        echo "::endgroup::"
+
+  export-voxtral-cuda-quantized-int4-weight-only:
+    name: export-voxtral-cuda-quantized-int4-weight-only
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: voxtral-cuda-quantized-int4-weight-only
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip install mistral-common librosa
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral with Quantization (int4-weight-only)"
+        optimum-cli export executorch \
+            --model "mistralai/Voxtral-Mini-3B-2507" \
+            --task "multimodal-text-to-text" \
+            --recipe "cuda" \
+            --dtype bfloat16 \
+            --device cuda \
+            --max_seq_len 1024 \
+            --qlinear 4w \
+            --qlinear_encoder 4w \
+            --output_dir ./
+        python -m executorch.extension.audio.mel_spectrogram \
+            --feature_size 128 \
+            --stack_output \
+            --max_audio_len 300 \
+            --output_file voxtral_preprocessor.pte
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        test -f voxtral_preprocessor.pte
+        echo "::endgroup::"
+
+        echo "::group::Store Voxtral Quantized Artifacts (int4-weight-only)"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}"
+        echo "::endgroup::"
+
   test-voxtral-cuda-e2e:
-    name: test-voxtral-cuda-e2e
-    needs: export-voxtral-cuda-artifact
+    name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
+    needs:
+      - export-voxtral-cuda-artifact
+      - export-voxtral-cuda-quantized-int4-tile-packed
+      - export-voxtral-cuda-quantized-int4-weight-only
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     strategy:
       fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
     with:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      download-artifact: voxtral-cuda-export
+      download-artifact: ${{ matrix.format.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
@@ -226,7 +369,7 @@ jobs:
         pip list
         echo "::endgroup::"
 
-        echo "::group::Prepare Voxtral Artifacts"
+        echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
         cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
         cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
         cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
@@ -255,7 +398,7 @@ jobs:
         cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
         echo "::endgroup::"
 
-        echo "::group::Run Voxtral Runner"
+        echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
         set +e
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \

@@ -39,8 +39,6 @@ This exports Voxtral with XNNPack backend acceleration and 4-bit weight/8-bit ac
 ## CUDA Support
 If your environment has CUDA support, you can enable the runner to run on CUDA for improved performance. Follow the export and runtime commands below:
 
-**Note:** We are currently working on quantization support for CUDA. Currently, only bfloat16 dtype is supported for CUDA execution.
-
 ### Exporting with CUDA
 ```
 optimum-cli export executorch \
@@ -57,6 +55,22 @@ This will generate:
 - `model.pte` - The exported model
 - `aoti_cuda_blob.ptd` - The CUDA kernel blob required for runtime
 
+Furthermore, we support several quantization formats on CUDA.
+To export Voxtral with int4 weight-only quantization, use
+```
+optimum-cli export executorch \
+  --model "mistralai/Voxtral-Mini-3B-2507" \
+  --task "multimodal-text-to-text" \
+  --recipe "cuda" \
+  --dtype bfloat16 \
+  --device cuda \
+  --max_seq_len 1024 \
+  --qlinear 4w \
+  --qlinear_encoder 4w \
+  --qembedding 4w \
+  --output_dir="voxtral"
+```
+
 See the "Building the multimodal runner" section below for instructions on building with CUDA support, and the "Running the model" section for runtime instructions.
 
 # Running the model
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		44d8d54e38c0258357d4e92e1fefe21e845947a3
		09fdbd0a0639b128f712a4f5202ed42ca4c60957