Skip to content

Commit af97aa8

Browse files
authored
Add int4mm test to the CUDA CI flow (pytorch#15181)
Tested in CI
1 parent 98baab7 commit af97aa8

File tree

3 files changed

+48
-11
lines changed

3 files changed

+48
-11
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
44d8d54e38c0258357d4e92e1fefe21e845947a3
1+
09fdbd0a0639b128f712a4f5202ed42ca4c60957

.github/workflows/cuda.yml

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,26 @@ jobs:
8888
PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
8989
9090
export-voxtral-cuda-artifact:
91-
name: export-voxtral-cuda-artifact
91+
name: export-voxtral-cuda-${{ matrix.quant.name }}
9292
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
9393
permissions:
9494
id-token: write
9595
contents: read
9696
secrets: inherit
9797
strategy:
9898
fail-fast: false
99+
matrix:
100+
quant:
101+
- name: "non-quantized"
102+
artifact: "voxtral-cuda-export"
103+
extra_args: ""
104+
- name: "quantized-int4-tile-packed"
105+
artifact: "voxtral-cuda-quantized-int4-tile-packed"
106+
extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
107+
- name: "quantized-int4-weight-only"
108+
artifact: "voxtral-cuda-quantized-int4-weight-only"
109+
# TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
110+
extra_args: "--qlinear_encoder 4w"
99111
with:
100112
timeout: 90
101113
secrets-env: EXECUTORCH_HF_TOKEN
@@ -104,7 +116,7 @@ jobs:
104116
gpu-arch-version: 12.6
105117
use-custom-docker-registry: false
106118
submodules: recursive
107-
upload-artifact: voxtral-cuda-export
119+
upload-artifact: ${{ matrix.quant.artifact }}
108120
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
109121
script: |
110122
set -eux
@@ -122,14 +134,16 @@ jobs:
122134
pip list
123135
echo "::endgroup::"
124136
125-
echo "::group::Export Voxtral"
137+
echo "::group::Export Voxtral (${{ matrix.quant.name }})"
138+
EXTRA_ARGS="${{ matrix.quant.extra_args }}"
126139
optimum-cli export executorch \
127140
--model "mistralai/Voxtral-Mini-3B-2507" \
128141
--task "multimodal-text-to-text" \
129142
--recipe "cuda" \
130143
--dtype bfloat16 \
131144
--device cuda \
132145
--max_seq_len 1024 \
146+
${EXTRA_ARGS} \
133147
--output_dir ./
134148
python -m executorch.extension.audio.mel_spectrogram \
135149
--feature_size 128 \
@@ -142,7 +156,7 @@ jobs:
142156
test -f voxtral_preprocessor.pte
143157
echo "::endgroup::"
144158
145-
echo "::group::Store Voxtral Artifacts"
159+
echo "::group::Store Voxtral Artifacts (${{ matrix.quant.name }})"
146160
mkdir -p "${RUNNER_ARTIFACT_DIR}"
147161
cp model.pte "${RUNNER_ARTIFACT_DIR}/"
148162
cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
@@ -201,22 +215,30 @@ jobs:
201215
echo "::endgroup::"
202216
203217
test-voxtral-cuda-e2e:
204-
name: test-voxtral-cuda-e2e
218+
name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
205219
needs: export-voxtral-cuda-artifact
206220
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
207221
permissions:
208222
id-token: write
209223
contents: read
210224
strategy:
211225
fail-fast: false
226+
matrix:
227+
format:
228+
- name: "non-quantized"
229+
artifact: "voxtral-cuda-export"
230+
- name: "quantized-int4-tile-packed"
231+
artifact: "voxtral-cuda-quantized-int4-tile-packed"
232+
- name: "quantized-int4-weight-only"
233+
artifact: "voxtral-cuda-quantized-int4-weight-only"
212234
with:
213235
timeout: 90
214236
runner: linux.g5.4xlarge.nvidia.gpu
215237
gpu-arch-type: cuda
216238
gpu-arch-version: 12.6
217239
use-custom-docker-registry: false
218240
submodules: recursive
219-
download-artifact: voxtral-cuda-export
241+
download-artifact: ${{ matrix.format.artifact }}
220242
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
221243
script: |
222244
set -eux
@@ -226,7 +248,7 @@ jobs:
226248
pip list
227249
echo "::endgroup::"
228250
229-
echo "::group::Prepare Voxtral Artifacts"
251+
echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
230252
cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
231253
cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
232254
cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
@@ -255,7 +277,7 @@ jobs:
255277
cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
256278
echo "::endgroup::"
257279
258-
echo "::group::Run Voxtral Runner"
280+
echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
259281
set +e
260282
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
261283
OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \

examples/models/voxtral/README.md

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,6 @@ This exports Voxtral with XNNPack backend acceleration and 4-bit weight/8-bit ac
3939
## CUDA Support
4040
If your environment has CUDA support, you can enable the runner to run on CUDA for improved performance. Follow the export and runtime commands below:
4141

42-
**Note:** We are currently working on quantization support for CUDA. Currently, only bfloat16 dtype is supported for CUDA execution.
43-
4442
### Exporting with CUDA
4543
```
4644
optimum-cli export executorch \
@@ -57,6 +55,23 @@ This will generate:
5755
- `model.pte` - The exported model
5856
- `aoti_cuda_blob.ptd` - The CUDA kernel blob required for runtime
5957

58+
Furthermore, we support several quantization formats on CUDA.
59+
For example, to export Voxtral with int4 weights and int4mm for linear layers, you can use the following command:
60+
```
61+
optimum-cli export executorch \
62+
--model "mistralai/Voxtral-Mini-3B-2507" \
63+
--task "multimodal-text-to-text" \
64+
--recipe "cuda" \
65+
--dtype bfloat16 \
66+
--device cuda \
67+
--max_seq_len 1024 \
68+
--qlinear 4w \
69+
--qlinear_encoder 4w \
70+
--qlinear_packing_format tile_packed_to_4d \
71+
--qlinear_encoder_packing_format tile_packed_to_4d \
72+
--output_dir="voxtral"
73+
```
74+
6075
See the "Building the multimodal runner" section below for instructions on building with CUDA support, and the "Running the model" section for runtime instructions.
6176

6277
# Running the model

0 commit comments

Comments
 (0)