From 912c4876d2c5bfde8a40a8baf1a9db3b72d5adc4 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 16 Oct 2025 14:50:55 -0700
Subject: [PATCH 1/4] init commit

---
 install_dev.py                        |  2 +-
 optimum/commands/export/executorch.py |  4 +--
 tests/models/test_modeling_gemma3.py  | 40 +++++++++++++++++++++++++--
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/install_dev.py b/install_dev.py
index c31e76a..c32b53b 100644
--- a/install_dev.py
+++ b/install_dev.py
@@ -5,7 +5,7 @@
 
 def install_torch_nightly_deps():
     """Install torch related dependencies from pinned nightly"""
-    EXECUTORCH_NIGHTLY_VERSION = "dev20250916"
+    EXECUTORCH_NIGHTLY_VERSION = "dev20251015"
     TORCHAO_NIGHTLY_VERSION = "dev20250916"
     # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74
     TORCH_NIGHTLY_VERSION = "dev20250916"
diff --git a/optimum/commands/export/executorch.py b/optimum/commands/export/executorch.py
index 12398da..e877114 100644
--- a/optimum/commands/export/executorch.py
+++ b/optimum/commands/export/executorch.py
@@ -185,9 +185,7 @@ def run(self):
                     "--qlinear_packing_format can only be used when --device is set to CUDA (e.g., 'cuda', 'cuda:0', etc.)"
                 )
             if not self.args.qlinear or self.args.qlinear != "4w":
-                raise ValueError(
-                    "--qlinear_packing_format can only be used when --qlinear is set to '4w'"
-                )
+                raise ValueError("--qlinear_packing_format can only be used when --qlinear is set to '4w'")
         qlinear_encoder_packing_format = getattr(self.args, "qlinear_encoder_packing_format", None)
         if qlinear_encoder_packing_format:
             if not device or not device.startswith("cuda"):
diff --git a/tests/models/test_modeling_gemma3.py b/tests/models/test_modeling_gemma3.py
index ff507bb..f54e688 100644
--- a/tests/models/test_modeling_gemma3.py
+++ b/tests/models/test_modeling_gemma3.py
@@ -22,11 +22,15 @@
 import unittest
 
 import pytest
+import torch
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
 from transformers import AutoProcessor, AutoTokenizer
 from transformers.testing_utils import slow
 
-from optimum.executorch import ExecuTorchModelForCausalLM, ExecuTorchModelForMultiModalToText
+from optimum.executorch import (
+    ExecuTorchModelForCausalLM,
+    ExecuTorchModelForMultiModalToText,
+)
 
 from ..utils import check_causal_lm_output_quality, check_multimodal_output_quality
 
@@ -288,7 +292,10 @@ def test_gemma3_image_vision_with_custom_sdpa_kv_cache_8da4w_8we(self):
         processor = AutoProcessor.from_pretrained(model_id)
         image_url = "https://llava-vl.github.io/static/images/view.jpg"
         conversation = [
-            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful assistant."}],
+            },
             {
                 "role": "user",
                 "content": [
@@ -337,3 +344,32 @@ def test_gemma3_image_vision_with_custom_sdpa_kv_cache_8da4w_8we(self):
         self.assertTrue(
             check_multimodal_output_quality(model_id, generated_tokens, conversation, max_perplexity_threshold=5)
         )
+
+    @slow
+    @pytest.mark.run_slow
+    @pytest.mark.skipif(is_linux_ci, reason="OOM")
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA backend required")
+    def test_gemma3_export_to_executorch_in_cuda_recipe(self):
+        model_id = "unsloth/gemma-3-1b-it"
+        task = "text-generation"
+        recipe = "cuda"
+        output_subdir = "executorch"
+
+        with tempfile.TemporaryDirectory() as tempdir:
+            out_dir: str = f"{tempdir}/executorch"
+            subprocess.run(
+                f"optimum-cli export executorch \
+                --model {model_id} \
+                --task {task} \
+                --recipe {recipe} \
+                --output_dir {tempdir}/{output_subdir} \
+                --dtype bfloat16 \
+                --device cuda \
+                --max_seq_len 64",
+                shell=True,
+                check=True,
+            )
+            pte_full_path: str = f"{out_dir}/model.pte"
+            ptd_full_path: str = f"{out_dir}/aoti_cuda_blob.ptd"
+            self.assertTrue(os.path.exists(pte_full_path))
+            self.assertTrue(os.path.exists(ptd_full_path))

From 9b0a876a0de14a6771eb24e556b0a6ead45e587f Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 16 Oct 2025 14:58:05 -0700
Subject: [PATCH 2/4] use correct model id and task

---
 tests/models/test_modeling_gemma3.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_modeling_gemma3.py b/tests/models/test_modeling_gemma3.py
index f54e688..f6800f8 100644
--- a/tests/models/test_modeling_gemma3.py
+++ b/tests/models/test_modeling_gemma3.py
@@ -350,8 +350,8 @@ def test_gemma3_image_vision_with_custom_sdpa_kv_cache_8da4w_8we(self):
     @pytest.mark.skipif(is_linux_ci, reason="OOM")
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA backend required")
     def test_gemma3_export_to_executorch_in_cuda_recipe(self):
-        model_id = "unsloth/gemma-3-1b-it"
-        task = "text-generation"
+        model_id = "google/gemma-3-4b-it"
+        task = "multimodal-text-to-text"
         recipe = "cuda"
         output_subdir = "executorch"
 

From cb8fbcbf6037f843ceada90314d8cb681225f153 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Fri, 17 Oct 2025 10:42:46 -0700
Subject: [PATCH 3/4] remove redundant op decomp in cuda recipe

---
 optimum/exporters/executorch/recipes/cuda.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/optimum/exporters/executorch/recipes/cuda.py b/optimum/exporters/executorch/recipes/cuda.py
index 5aa6973..812e0b6 100644
--- a/optimum/exporters/executorch/recipes/cuda.py
+++ b/optimum/exporters/executorch/recipes/cuda.py
@@ -64,8 +64,6 @@ def export_to_executorch_with_cuda(
         For encoder-decoder models or multimodal models, it may generate multiple programs.
     """
     # Import here to avoid version conflicts.
-    from torch._inductor.decomposition import conv1d_to_conv2d
-
     from executorch.backends.cuda.cuda_backend import CudaBackend
     from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 
@@ -84,13 +82,7 @@ def _lower_to_executorch(
             key: [CudaPartitioner([CudaBackend.generate_method_name_compile_spec(key)])]
             for key in exported_programs.keys()
         }
-        # Add decompositions for triton to generate kernels.
-        for key, ep in exported_programs.items():
-            exported_programs[key] = ep.run_decompositions(
-                {
-                    aten.conv1d.default: conv1d_to_conv2d,
-                }
-            )
+
         with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]):
             et_prog = to_edge_transform_and_lower(
                 exported_programs,

From 1f81b707dfbcef34d43a0880a2932a2d0a29c02c Mon Sep 17 00:00:00 2001
From: Gasoonjia
Date: Fri, 17 Oct 2025 14:45:25 -0700
Subject: [PATCH 4/4] Update install_dev.py

---
 install_dev.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install_dev.py b/install_dev.py
index c32b53b..c31e76a 100644
--- a/install_dev.py
+++ b/install_dev.py
@@ -5,7 +5,7 @@
 
 def install_torch_nightly_deps():
     """Install torch related dependencies from pinned nightly"""
-    EXECUTORCH_NIGHTLY_VERSION = "dev20251015"
+    EXECUTORCH_NIGHTLY_VERSION = "dev20250916"
     TORCHAO_NIGHTLY_VERSION = "dev20250916"
     # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74
     TORCH_NIGHTLY_VERSION = "dev20250916"