
Commit bc931ba

Merge branch 'main' into pin-bump-oct17
2 parents dd8e85a + 7f68c4f commit bc931ba


47 files changed: +1828 −364 lines

.github/workflows/cuda.yml

Lines changed: 205 additions & 2 deletions
@@ -164,6 +164,75 @@ jobs:
         ls -al "${RUNNER_ARTIFACT_DIR}"
         echo "::endgroup::"
 
+  export-gemma3-cuda-artifact:
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+      matrix:
+        quant:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+            extra_args: ""
+          # TODO: enable gemma3 quantization
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "voxtral-cuda-quantized-int4-tile-packed"
+          #   extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "voxtral-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
+        optimum-cli export executorch \
+          --model "google/gemma-3-4b-it" \
+          --task "multimodal-text-to-text" \
+          --recipe "cuda" \
+          --dtype bfloat16 \
+          --device cuda \
+          --max_seq_len 64 \
+          --output_dir ./
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}/"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}/"
+        echo "::endgroup::"
+
   benchmark-voxtral-cuda:
     name: benchmark-voxtral-cuda
     needs: export-voxtral-cuda-artifact
@@ -204,13 +273,63 @@ jobs:
           -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
           -DEXECUTORCH_BUILD_TESTS=ON \
           -Bcmake-out .
-        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
         echo "::endgroup::"
 
         echo "::group::Run Voxtral Benchmark"
 
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+        cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
+  benchmark-gemma3-cuda:
+    name: benchmark-gemma3-cuda
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+          -DEXECUTORCH_BUILD_TESTS=ON \
+          -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd
 
         echo "::endgroup::"
 
@@ -302,3 +421,87 @@ jobs:
           exit $EXIT_CODE
         fi
         echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          # TODO: enable quantized gemma3.
+          # - name: "quantized-int4-tile-packed"
+          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+          -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Sexamples/models/gemma3 \
+          -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+          --model_path model.pte \
+          --data_path aoti_cuda_blob.ptd \
+          --tokenizer_path tokenizer.json \
+          --image_path $IMAGE_PATH \
+          --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"
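A side note on the tokenizer step in test-gemma3-cuda-e2e: the curl download of tokenizer.json could equally be done through the huggingface_hub Python API that the export job already installs. A minimal sketch, not part of this commit and shown only for illustration:

# Illustrative only: fetch the same tokenizer.json that the e2e job pulls via curl.
# Assumes huggingface_hub is installed (the export job installs huggingface_hub[cli]).
from huggingface_hub import hf_hub_download

tokenizer_path = hf_hub_download(
    repo_id="unsloth/gemma-3-1b-it",  # same repo as TOKENIZER_URL in the workflow
    filename="tokenizer.json",
)
print(tokenizer_path)  # local cache path that could be passed as --tokenizer_path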

backends/cadence/aot/ops_registrations.py

Lines changed: 6 additions & 6 deletions
@@ -53,7 +53,6 @@ def _validate_ref_impl_exists() -> None:
 # 1. be removed
 # 2. have a reference implementation added to ref_implementations.py
 _WARN_ONLY = {
-    "cadence::_softmax_f32_f32",
     "cadence::quantized_softmax.per_tensor",
     "cadence::quantized_softmax",
     "cadence::quantized_w8a32_gru",
@@ -640,10 +639,10 @@ def register_fake(
     "int sampling_ratio, bool aligned) -> (Tensor out)"
 )
 lib.define(
-    "_softmax_f32_f32(Tensor self, int dim, bool? half_to_float) -> (Tensor out)"
+    "_softmax_f32_f32(Tensor self, int dim, bool? half_to_float = None) -> (Tensor out)"
 )
 lib.define(
-    "_softmax_f32_f32.out(Tensor self, int dim, bool? half_to_float, *, Tensor(a!) out) -> Tensor(a!)"
+    "_softmax_f32_f32.out(Tensor self, int dim, bool? half_to_float = None, *, Tensor(a!) out) -> Tensor(a!)"
 )
 
 lib.define(
@@ -2652,12 +2651,13 @@ def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_meta(
 
 @register_fake("cadence::_softmax_f32_f32")
 def softmax_f32_f32_meta(
-    self: torch.Tensor,
+    input_tensor: torch.Tensor,
     dim: int,
-    dtype: torch.dtype,
     half_to_float: Optional[bool] = None,
 ) -> torch.Tensor:
-    return self.new_empty(self.size(), dtype=self.dtype)
+    assert input_tensor.dtype == torch.float32, "input_tensor must be float32"
+    assert not half_to_float, "half_to_float is not supported"
+    return input_tensor.new_empty(input_tensor.size(), dtype=torch.float32)
 
 
 @register_fake("cadence::quantized_softmax")
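For context on what the updated fake kernel buys: with half_to_float defaulting to None and the stray dtype parameter dropped, shape and dtype propagation for cadence::_softmax_f32_f32 can be exercised without a device. A minimal sketch, assuming the op registrations from this repo are importable as the Python module path shown below:

# Illustrative sketch: run the fake (meta) kernel under FakeTensorMode to check
# that cadence::_softmax_f32_f32 propagates shape and dtype as expected.
# The import path is an assumption about how this repo packages ops_registrations.
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

import executorch.backends.cadence.aot.ops_registrations  # noqa: F401  # registers the op

with FakeTensorMode():
    x = torch.empty(2, 3, dtype=torch.float32)
    out = torch.ops.cadence._softmax_f32_f32(x, dim=1)  # half_to_float now defaults to None
    assert out.shape == x.shape and out.dtype == torch.float32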

backends/cadence/aot/ref_implementations.py

Lines changed: 11 additions & 0 deletions
@@ -1979,3 +1979,14 @@ def linalg_svd(
     assert compute_uv
     U, S, Vh = torch.linalg.svd(A, full_matrices=full_matrices, driver=driver)
     return U.contiguous(), S.contiguous(), Vh.contiguous()
+
+
+@impl_tracked(m, "_softmax_f32_f32")
+def softmax_f32_f32(
+    input_tensor: torch.Tensor,
+    dim: int,
+    half_to_float: bool | None = None,
+) -> torch.Tensor:
+    assert input_tensor.dtype == torch.float32, "input_tensor must be float32"
+    assert not half_to_float, "half_to_float is not supported"
+    return torch.nn.functional.softmax(input_tensor, dim=dim, dtype=torch.float32)

backends/cadence/aot/tests/test_ref_implementations.py

Lines changed: 9 additions & 0 deletions
@@ -2885,3 +2885,12 @@ def test_quantized_layer_norm(self) -> None:
             output_scale,
             output_zero_point,
         )
+
+    def test_softmax_f32_f32(self) -> None:
+        # Just a wrapper around torch.nn.functional.softmax, so just ensure that it runs
+        input_tensor = torch.tensor(
+            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=torch.float32
+        )
+        output = torch.ops.cadence._softmax_f32_f32(input_tensor, dim=1)
+        self.assertEqual(output.dtype, torch.float32)
+        self.assertEqual(output.shape, input_tensor.shape)

backends/cuda/CMakeLists.txt

Lines changed: 39 additions & 6 deletions
@@ -34,6 +34,39 @@ find_package(CUDAToolkit REQUIRED)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
+# CUDA tensor maker for backends that support incontiguous tensors
+set(_tensor_maker_sources runtime/tensor/tensor_maker.cpp)
+add_library(cuda_tensor_maker STATIC ${_tensor_maker_sources})
+target_include_directories(
+  cuda_tensor_maker
+  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
+         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
+)
+target_compile_options(
+  cuda_tensor_maker
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
+)
+# Ensure symbols are exported properly
+if(APPLE)
+  target_link_options(cuda_tensor_maker PUBLIC -Wl,-export_dynamic)
+else()
+  target_link_options(
+    cuda_tensor_maker PUBLIC
+    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
+  )
+endif()
+
+# Link against ExecuTorch core libraries
+target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS})
+executorch_target_link_options_shared_lib(cuda_tensor_maker)
+
+install(
+  TARGETS cuda_tensor_maker
+  EXPORT ExecuTorchTargets
+  DESTINATION lib
+)
+
 # CUDA-specific AOTI functionality
 set(_aoti_cuda_sources
   runtime/cuda_backend.cpp
@@ -62,20 +95,20 @@ target_link_options(
   aoti_cuda PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
 )
 
-# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
+# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch
+# CUDA libraries
 target_link_libraries(
-  aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
+  aoti_cuda PUBLIC aoti_common cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
 executorch_target_link_options_shared_lib(aoti_cuda)
 
 if(BUILD_TESTING)
-  # Add runtime
-  add_executable(voxtral_runner tests/voxtral_runner.cpp)
+  add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp)
   target_link_libraries(
-    voxtral_runner PUBLIC aoti_cuda extension_module_static
-    extension_flat_tensor portable_ops_lib
+    multimodal_benchmark PUBLIC aoti_cuda extension_module_static
+    extension_flat_tensor portable_ops_lib
   )
 endif()
 

backends/cuda/cuda_backend.py

Lines changed: 3 additions & 0 deletions
@@ -140,6 +140,9 @@ def preprocess(
         user_input_placeholders.append(node.meta["val"])
 
         options: dict[str, typing.Any] = {
+            # Disable this to support sdpa decomposition
+            # TODO(gasoonjia): remove it after pin bump to latest pytorch
+            "loop_ordering_after_fusion": False,
             # Better model precision
             "emulate_precision_casts": True,
             # Embed CUDA kernel binaries directly into the compiled shared object
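The two flags added or already present in this options dict match torch._inductor config settings of the same names, so the behavior can be reproduced outside the backend when debugging. A minimal, hypothetical sketch; how cuda_backend.py actually forwards options to Inductor is not shown in this hunk:

# Illustrative only: apply the same Inductor settings that the CUDA backend's
# options dict requests, using torch._inductor.config directly.
import torch._inductor.config as inductor_config

with inductor_config.patch(
    {
        "loop_ordering_after_fusion": False,  # keep the SDPA decomposition working (see TODO above)
        "emulate_precision_casts": True,  # better model precision
    }
):
    # ... compile or export the model here (e.g. torch.export followed by AOTInductor) ...
    pass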
