@@ -164,6 +164,75 @@ jobs:
         ls -al "${RUNNER_ARTIFACT_DIR}"
         echo "::endgroup::"
 
+  export-gemma3-cuda-artifact:
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+            extra_args: ""
+          # TODO: enable gemma3 quantization
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          #   extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Needs further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
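+      # The ref above resolves to the PR head SHA on pull_request events and to
+      # the pushed SHA otherwise, so CI always tests the contributor's commit.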
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
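+        # The reusable workflow exposes the secret requested via secrets-env
+        # under a SECRET_ prefix, hence SECRET_EXECUTORCH_HF_TOKEN here.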
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
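+        # Installing optimum-executorch at the CI commit pin keeps the exporter
+        # in lockstep with the ExecuTorch revision under test.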
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
+        optimum-cli export executorch \
+            --model "google/gemma-3-4b-it" \
+            --task "multimodal-text-to-text" \
+            --recipe "cuda" \
+            --dtype bfloat16 \
+            --device cuda \
+            --max_seq_len 64 \
+            ${EXTRA_ARGS} \
+            --output_dir ./
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
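+        # model.pte is the exported ExecuTorch program; aoti_cuda_blob.ptd
+        # presumably carries the AOTInductor CUDA payload loaded through the
+        # named data map at runtime.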
+        echo "::endgroup::"
+
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}/"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}/"
+        echo "::endgroup::"
+
   benchmark-voxtral-cuda:
     name: benchmark-voxtral-cuda
     needs: export-voxtral-cuda-artifact
@@ -204,13 +273,63 @@ jobs:
               -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
               -DEXECUTORCH_BUILD_TESTS=ON \
               -Bcmake-out .
-        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
         echo "::endgroup::"
 
         echo "::group::Run Voxtral Benchmark"
 
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+        cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
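+        # multimodal_benchmark replaces the model-specific voxtral_runner; its
+        # first argument selects the model, so one binary serves both jobs.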
+
+        echo "::endgroup::"
+
+  benchmark-gemma3-cuda:
+    name: benchmark-gemma3-cuda
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: gemma3-cuda-export
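+      # Matches the upload-artifact name in export-gemma3-cuda-artifact; the
+      # downloaded files are exposed to the script via ${RUNNER_ARTIFACT_DIR}.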
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
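+        # CMAKE_ARGS is forwarded to the pip build, presumably so ExecuTorch
+        # itself is compiled with the CUDA backend enabled.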
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+              -DEXECUTORCH_BUILD_TESTS=ON \
+              -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
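+        # /opt/conda/lib is prepended so the benchmark binary can resolve the
+        # conda-provided CUDA shared libraries at load time.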
+        cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd
 
         echo "::endgroup::"
 
@@ -302,3 +421,87 @@ jobs:
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          # TODO: enable quantized gemma3.
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
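+        # Gemma 3 sizes share a tokenizer, so the 1b-it tokenizer.json should
+        # also serve the exported gemma-3-4b-it model.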
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
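+        # Downstream assertion: the runner output is grepped for "chip", the
+        # expected description of this logo image.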
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_INSTALL_PREFIX=cmake-out \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
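+        # The llm preset presumably builds and installs the LLM runner
+        # libraries that the gemma3 example below links against.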
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Sexamples/models/gemma3 \
+              -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+            --model_path model.pte \
+            --data_path aoti_cuda_blob.ptd \
+            --tokenizer_path tokenizer.json \
+            --image_path $IMAGE_PATH \
+            --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
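+        # The set +e / set -e bracket captures the combined output and exit
+        # code without -e aborting the script before the checks below run.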
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"