
Commit 436bf3d

Update base for Update on "gemma3 e2e runner on cuda"
This diff introduces an end-to-end (e2e) runner for the gemma3 model on CUDA, delegated via the AOTI library and guarded by CI. It also includes the infrastructure updates needed to build and run the `gemma3 e2e runner` on CUDA devices. Differential Revision: [D85087532](https://our.internmc.facebook.com/intern/diff/D85087532/) [ghstack-poisoned]
2 parents 96704b5 + baa41c6 commit 436bf3d

43 files changed: +1216 −763 lines
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-44d8d54e38c0258357d4e92e1fefe21e845947a3
+09fdbd0a0639b128f712a4f5202ed42ca4c60957

.github/workflows/cuda.yml

Lines changed: 30 additions & 8 deletions

@@ -88,14 +88,26 @@ jobs:
       PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda

   export-voxtral-cuda-artifact:
-    name: export-voxtral-cuda-artifact
+    name: export-voxtral-cuda-${{ matrix.quant.name }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     secrets: inherit
     strategy:
       fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+            extra_args: ""
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+            extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
+            # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+            extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -104,7 +116,7 @@
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      upload-artifact: voxtral-cuda-export
+      upload-artifact: ${{ matrix.quant.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
@@ -122,14 +134,16 @@
         pip list
         echo "::endgroup::"

-        echo "::group::Export Voxtral"
+        echo "::group::Export Voxtral (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
         optimum-cli export executorch \
             --model "mistralai/Voxtral-Mini-3B-2507" \
             --task "multimodal-text-to-text" \
             --recipe "cuda" \
             --dtype bfloat16 \
             --device cuda \
             --max_seq_len 1024 \
+            ${EXTRA_ARGS} \
             --output_dir ./
         python -m executorch.extension.audio.mel_spectrogram \
             --feature_size 128 \
@@ -142,7 +156,7 @@
         test -f voxtral_preprocessor.pte
         echo "::endgroup::"

-        echo "::group::Store Voxtral Artifacts"
+        echo "::group::Store Voxtral Artifacts (${{ matrix.quant.name }})"
         mkdir -p "${RUNNER_ARTIFACT_DIR}"
         cp model.pte "${RUNNER_ARTIFACT_DIR}/"
         cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
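
For reference, here is roughly how the "quantized-int4-tile-packed" matrix entry expands once EXTRA_ARGS is substituted into the export step above — a sketch assembled from the flags in this hunk, not a verbatim CI transcript:

    # Export Voxtral for CUDA with int4 tile-packed quantization
    # (EXTRA_ARGS from the matrix entry substituted inline).
    optimum-cli export executorch \
        --model "mistralai/Voxtral-Mini-3B-2507" \
        --task "multimodal-text-to-text" \
        --recipe "cuda" \
        --dtype bfloat16 \
        --device cuda \
        --max_seq_len 1024 \
        --qlinear 4w \
        --qlinear_encoder 4w \
        --qlinear_packing_format tile_packed_to_4d \
        --qlinear_encoder_packing_format tile_packed_to_4d \
        --output_dir ./

The "quantized-int4-weight-only" entry passes only `--qlinear_encoder 4w`, per the TODO noting that `--qlinear 4w` currently produces invalid results for that variant.
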
@@ -306,22 +320,30 @@
         echo "::endgroup::"

   test-voxtral-cuda-e2e:
-    name: test-voxtral-cuda-e2e
+    name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
     needs: export-voxtral-cuda-artifact
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     strategy:
       fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
     with:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      download-artifact: voxtral-cuda-export
+      download-artifact: ${{ matrix.format.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
@@ -331,7 +353,7 @@
         pip list
         echo "::endgroup::"

-        echo "::group::Prepare Voxtral Artifacts"
+        echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
         cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
         cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
         cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
@@ -360,7 +382,7 @@
         cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
         echo "::endgroup::"

-        echo "::group::Run Voxtral Runner"
+        echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
         set +e
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
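
Taken together, the e2e hunks above amount to roughly the following local approximation for a single matrix entry. This is a sketch of the steps visible in this diff: it assumes the chosen artifact (e.g. voxtral-cuda-quantized-int4-tile-packed) was already downloaded into ${RUNNER_ARTIFACT_DIR}, the cmake configure step is outside this diff, and the runner's command-line arguments are truncated here, so they are left elided.

    # Stage the exported artifacts next to the build tree.
    cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
    cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
    cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
    # Build only the runner target (configure step not shown in this diff).
    cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
    # Run the runner; its arguments are cut off in this diff and omitted here.
    export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
    cmake-out/examples/models/voxtral/voxtral_runner ...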

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions

@@ -24,8 +24,8 @@ For Apple, please refer to the [iOS documentation](docs/source/using-executorch-
 executorch
 ├── <a href="backends">backends</a> - Backend delegate implementations for various hardware targets. Each backend uses partitioner to split the graph into subgraphs that can be executed on specific hardware, quantizer to optimize model precision, and runtime components to execute the graph on target hardware. For details refer to the <a href="docs/source/backend-delegates-integration.md">backend documentation</a> and the <a href="docs/source/using-executorch-export.md">Export and Lowering tutorial</a> for more information.
 │ ├── <a href="backends/apple">apple</a> - Apple-specific backends.
-│ │ ├── <a href="backends/apple/coreml">coreml</a> - CoreML backend for Apple devices. See <a href="docs/source/backends-coreml.md">doc</a>.
-│ │ └── <a href="backends/apple/mps">mps</a> - Metal Performance Shaders backend for Apple devices. See <a href="docs/source/backends-mps.md">doc</a>.
+│ │ ├── <a href="backends/apple/coreml">coreml</a> - CoreML backend for Apple devices. See <a href="docs/source/backends/coreml/coreml-overview.md">doc</a>.
+│ │ └── <a href="backends/apple/mps">mps</a> - Metal Performance Shaders backend for Apple devices. See <a href="docs/source/backends/mps/mps-overview.md">doc</a>.
 │ ├── <a href="backends/arm">arm</a> - ARM architecture backends. See <a href="docs/source/backends-arm-ethos-u.md">doc</a>.
 │ ├── <a href="backends/cadence">cadence</a> - Cadence-specific backends. See <a href="docs/source/backends-cadence.md">doc</a>.
 │ ├── <a href="backends/example">example</a> - Example backend implementations.

README-wheel.md

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ The prebuilt `executorch.runtime` module included in this package provides a way
 to run ExecuTorch `.pte` files, with some restrictions:
 * Only [core ATen operators](docs/source/ir-ops-set-definition.md) are linked into the prebuilt module
 * Only the [XNNPACK backend delegate](docs/source/backends-xnnpack.md) is linked into the prebuilt module.
-* \[macOS only] [Core ML](docs/source/backends-coreml.md) and [MPS](docs/source/backends-mps.md) backend
+* \[macOS only] [Core ML](docs/source/backends/coreml/coreml-overview.md) and [MPS](docs/source/backends/mps/mps-overview.md) backend
   are also linked into the prebuilt module.

 Please visit the [ExecuTorch website](https://pytorch.org/executorch) for

backends/apple/coreml/README.md

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 # ExecuTorch Core ML Delegate

 This subtree contains the Core ML Delegate implementation for ExecuTorch.
-Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices. To learn how to use the CoreML delegate, see the [documentation](https://github.com/pytorch/executorch/blob/main/docs/source/backends-coreml.md).
+Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices. To learn how to use the CoreML delegate, see the [documentation](https://github.com/pytorch/executorch/blob/main/docs/source/backends/coreml/coreml-overview.md).

 ## Layout
 - `compiler/` : Lowers a module to Core ML backend.

backends/cadence/aot/ops_registrations.py

Lines changed: 0 additions & 1 deletion

@@ -60,7 +60,6 @@ def _validate_ref_impl_exists() -> None:
     "cadence::quantized_softmax.per_tensor",
     "cadence::quantized_conv2d_nchw",  # We should only support per_tensor variant, should remove
     "cadence::quantized_relu",  # We should only support per_tensor variant, should remove
-    "cadence::linalg_svd",
     "cadence::quantized_conv2d_nhwc",  # We should only support per_tensor variant, should remove
     "cadence::quantized_softmax",
     "cadence::quantized_w8a32_gru",
