Commit a5bc7c8

Update on "adding cuda memory estimation support"

Differential Revision: [D85119089](https://our.internmc.facebook.com/intern/diff/D85119089/) [ghstack-poisoned]

2 parents: 0d58b63 + c959fe9

54 files changed: +2076 −307 lines

.github/workflows/cuda.yml

Lines changed: 101 additions & 3 deletions
@@ -165,14 +165,27 @@ jobs:
         echo "::endgroup::"

   export-gemma3-cuda-artifact:
-    name: export-gemma3-cuda-artifact
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     secrets: inherit
     strategy:
       fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+            extra_args: ""
+          # TODO: enable gemma3 quantization
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "voxtral-cuda-quantized-int4-tile-packed"
+          #   extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "voxtral-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN

@@ -198,7 +211,8 @@ jobs:
         pip list
         echo "::endgroup::"

-        echo "::group::Export Gemma3"
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
         optimum-cli export executorch \
           --model "google/gemma-3-4b-it" \
           --task "multimodal-text-to-text" \

@@ -212,7 +226,7 @@ jobs:
         test -f aoti_cuda_blob.ptd
         echo "::endgroup::"

-        echo "::group::Store Gemma3 Artifacts"
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
         mkdir -p "${RUNNER_ARTIFACT_DIR}/"
         cp model.pte "${RUNNER_ARTIFACT_DIR}/"
         cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"

@@ -407,3 +421,87 @@ jobs:
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          # TODO: enable quantized gemma3.
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/gemma3 \
+          -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+          --model_path model.pte \
+          --data_path aoti_cuda_blob.ptd \
+          --tokenizer_path tokenizer.json \
+          --image_path $IMAGE_PATH \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ executorch
 │ ├── <a href="backends/openvino">openvino</a> - OpenVINO backend for Intel hardware.
 │ ├── <a href="backends/qualcomm">qualcomm</a> - Qualcomm-specific backends. See <a href="docs/source/backends-qualcomm.md">doc</a>.
 │ ├── <a href="backends/transforms">transforms</a> - Transformations for backend optimization.
-│ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends-vulkan.md">doc</a>.
+│ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends/vulkan/vulkan-overview.md">doc</a>.
 │ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends/xnnpack/xnnpack-overview.md">doc</a>.
 ├── <a href="codegen">codegen</a> - Tooling to autogenerate bindings between kernels and the runtime.
 ├── <a href="configurations">configurations</a> - Configuration files.

backends/arm/test/models/test_nn_modules.py

Lines changed: 77 additions & 18 deletions
@@ -17,32 +17,91 @@
 - Transformer
 """

+from typing import Callable
+
 import torch
 from executorch.backends.arm.test.common import parametrize
 from executorch.backends.arm.test.tester.test_pipeline import (
     TosaPipelineFP,
     TosaPipelineINT,
 )

+
+def make_module_wrapper(
+    name: str, module_factory: Callable[[], torch.nn.Module]
+) -> torch.nn.Module:
+    class ModuleWrapper(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self._module = module_factory()
+
+        def forward(self, *args, **kwargs):
+            return self._module(*args, **kwargs)
+
+    ModuleWrapper.__name__ = name
+    ModuleWrapper.__qualname__ = name
+    return ModuleWrapper()
+
+
 example_input = torch.rand(1, 6, 16, 16)

 module_tests = [
-    (torch.nn.Embedding(10, 10), (torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]),)),
-    (torch.nn.LeakyReLU(), (example_input,)),
-    (torch.nn.BatchNorm1d(16), (torch.rand(6, 16, 16),)),
-    (torch.nn.AdaptiveAvgPool2d((12, 12)), (example_input,)),
-    (torch.nn.ConvTranspose2d(6, 3, 2), (example_input,)),
-    (torch.nn.GRU(10, 20, 2), (torch.randn(5, 3, 10), torch.randn(2, 3, 20))),
-    (torch.nn.GroupNorm(2, 6), (example_input,)),
-    (torch.nn.InstanceNorm2d(16), (example_input,)),
-    (torch.nn.PReLU(), (example_input,)),
     (
-        torch.nn.Transformer(
-            d_model=64,
-            nhead=1,
-            num_encoder_layers=1,
-            num_decoder_layers=1,
-            dtype=torch.float32,
+        make_module_wrapper(
+            "EmbeddingModule",
+            lambda: torch.nn.Embedding(10, 10),
+        ),
+        (torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]),),
+    ),
+    (
+        make_module_wrapper("LeakyReLUModule", torch.nn.LeakyReLU),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper("BatchNorm1dModule", lambda: torch.nn.BatchNorm1d(16)),
+        (torch.rand(6, 16, 16),),
+    ),
+    (
+        make_module_wrapper(
+            "AdaptiveAvgPool2dModule",
+            lambda: torch.nn.AdaptiveAvgPool2d((12, 12)),
+        ),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper(
+            "ConvTranspose2dModule", lambda: torch.nn.ConvTranspose2d(6, 3, 2)
+        ),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper("GRUModule", lambda: torch.nn.GRU(10, 20, 2)),
+        (torch.randn(5, 3, 10), torch.randn(2, 3, 20)),
+    ),
+    (
+        make_module_wrapper("GroupNormModule", lambda: torch.nn.GroupNorm(2, 6)),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper(
+            "InstanceNorm2dModule", lambda: torch.nn.InstanceNorm2d(16)
+        ),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper("PReLUModule", torch.nn.PReLU),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper(
+            "TransformerModule",
+            lambda: torch.nn.Transformer(
+                d_model=64,
+                nhead=1,
+                num_encoder_layers=1,
+                num_decoder_layers=1,
+                dtype=torch.float32,
+            ),
         ),
         (torch.rand((10, 32, 64)), torch.rand((20, 32, 64))),
     ),

@@ -78,9 +137,9 @@ def test_nn_Modules_FP(test_data):
     "test_data",
     test_parameters,
     xfails={
-        "GRU": "RuntimeError: Node aten_linear_default with op <EdgeOpOverload: aten.linear[...]> was not decomposed or delegated.",
-        "PReLU": "RuntimeError: mul(): functions with out=... arguments don't support automatic differentiation, but one of the arguments requires grad.",
-        "Transformer": "AssertionError: Output 0 does not match reference output.",
+        "GRUModule": "RuntimeError: Node aten_linear_default with op <EdgeOpOverload: aten.linear[...]> was not decomposed or delegated.",
+        "PReLUModule": "RuntimeError: mul(): functions with out=... arguments don't support automatic differentiation, but one of the arguments requires grad.",
+        "TransformerModule": "AssertionError: Output 0 does not match reference output.",
     },
 )
 def test_nn_Modules_INT(test_data):
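Each entry in `module_tests` now carries a uniquely named wrapper class instead of a bare `torch.nn` module; the suite's `parametrize` helper appears to derive test ids, and therefore `xfails` keys, from that class name, which is why the keys change from "GRU" to "GRUModule" above. A short usage sketch, assuming the `make_module_wrapper` defined in this diff:

```python
import torch

# Illustrative sketch: the wrapper instance reports the name passed to the
# factory, giving otherwise generically named module types distinct test ids.
gru = make_module_wrapper("GRUModule", lambda: torch.nn.GRU(10, 20, 2))
assert type(gru).__name__ == "GRUModule"

# Calls are forwarded unchanged to the wrapped module.
out, hidden = gru(torch.randn(5, 3, 10), torch.randn(2, 3, 20))
print(out.shape)  # torch.Size([5, 3, 20])
```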

backends/arm/test/models/test_resnet18.py

Lines changed: 0 additions & 3 deletions
@@ -79,9 +79,6 @@ def test_resnet_u55_INT(per_channel_quantization):


 @pytest.mark.slow
-@pytest.mark.xfail(
-    reason="For resnet18 for Ethos-U85, the SRAM memory footprint is very high. The compiler team is investigating."
-)
 @common.XfailIfNoCorstone320
 @common.parametrize("per_channel_quantization", quant_test_data)
 def test_resnet_u85_INT(per_channel_quantization):
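With the decorator gone, `test_resnet_u85_INT` is expected to pass on Corstone-320 rather than being tolerated as a known failure. For reference, a minimal standalone sketch of the non-strict `pytest.mark.xfail` behavior that was removed (not code from this repo):

```python
import pytest

# With the marker, a failing test is reported as "xfail" and does not fail
# the suite; once the marker is removed, the same failure fails the suite.
@pytest.mark.xfail(reason="illustrative: known issue under investigation")
def test_known_issue():
    assert False
```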

backends/arm/test/models/test_torch_functions.py

Lines changed: 0 additions & 1 deletion
@@ -128,7 +128,6 @@ def test_torch_fns_FP(test_data):
         "Requires dynamic output shape.",
         "topk": "NotImplementedError: No registered serialization name for <class 'torch.return_types.topk'> found",
         "sort": "NotImplementedError: No registered serialization name for <class 'torch.return_types.sort'> found",
-        "t": "MLETORCH-855: Issue with Quantization folding.",
     },
     strict=False,
 )
