From b1d4dabe3fffa170a6f0f8dae43eca0aea85efba Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 24 Sep 2025 16:14:41 -0700 Subject: [PATCH 1/3] Implement quantization fallback to 8w per channel if block size is indivisible --- optimum/exporters/executorch/quantization.py | 51 ++++++++++++++++++-- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/executorch/quantization.py b/optimum/exporters/executorch/quantization.py index 395ec55..2dd9ee1 100644 --- a/optimum/exporters/executorch/quantization.py +++ b/optimum/exporters/executorch/quantization.py @@ -88,9 +88,52 @@ def quantize_model_( granularity=linear_weight_granularity, ), }[qlinear_config] - quantize_( - eager_model, - linear_config, - ) + + if qlinear_group_size > 0: + # First, quantize layers that are compatible with group quantization + def group_quantizable_filter(module, fqn): + if isinstance(module, torch.nn.Linear): + # Check if hidden dimension is divisible by group size + # For Linear layers, weight shape is [out_features, in_features] + # Group quantization typically applies to the in_features dimension (dim=1) + return module.weight.shape[1] % qlinear_group_size == 0 + return False + + quantize_( + eager_model, + linear_config, + filter_fn=group_quantizable_filter, + ) + + # Then, quantize incompatible layers with 8w per-channel quantization + per_channel_config = IntxWeightOnlyConfig( + weight_dtype=torch.int8, + granularity=PerAxis(0), + ) + + def per_channel_filter(module, fqn): + if isinstance(module, torch.nn.Linear): + # Only quantize layers that are NOT compatible with group quantization + # and haven't been quantized yet + if hasattr(module.weight, "tensor_impl"): + # Already quantized, skip + return False + return module.weight.shape[1] % qlinear_group_size != 0 + return False + + logging.info( + f"Applying per-channel quantization to linear layers incompatible with group size {qlinear_group_size}." 
+ ) + quantize_( + eager_model, + per_channel_config, + filter_fn=per_channel_filter, + ) + else: + # qlinear_group_size == 0, use per-channel for all + quantize_( + eager_model, + linear_config, + ) unwrap_tensor_subclass(eager_model) From da772d24657812c855959f6fa2ae0c1fe69e2e7a Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 29 Sep 2025 10:12:25 -0700 Subject: [PATCH 2/3] Add fallback logic --- optimum/commands/export/executorch.py | 16 ++- optimum/exporters/executorch/quantization.py | 111 ++++++++++-------- .../tasks/multimodal_text_to_text.py | 14 ++- 3 files changed, 84 insertions(+), 57 deletions(-) diff --git a/optimum/commands/export/executorch.py b/optimum/commands/export/executorch.py index 09a907f..c25c1d5 100644 --- a/optimum/commands/export/executorch.py +++ b/optimum/commands/export/executorch.py @@ -76,12 +76,14 @@ def parse_args_executorch(parser): required_group.add_argument( "--qlinear", type=str, - choices=["8da4w", "4w", "8w"], + choices=["8da4w", "4w", "8w", "8da8w", "8da4w,8da8w"], required=False, help=( "Quantization config for decoder linear layers.\n\n" "Options:\n" " 8da4w - 8-bit dynamic activation, 4-bit weight\n" + " 8da8w - 8-bit dynamic activation, 8-bit weight\n" + " 8da4w,8da8w - 8-bit dynamic activation, 4-bit weight and 8-bit weight\n" " 4w - 4-bit weight only\n" " 8w - 8-bit weight only" ), @@ -92,12 +94,14 @@ def parse_args_executorch(parser): required_group.add_argument( "--qlinear_encoder", type=str, - choices=["8da4w", "4w", "8w"], + choices=["8da4w", "4w", "8w", "8da8w", "8da4w,8da8w"], required=False, help=( "Quantization config for linear layers.\n\n" "Options:\n" " 8da4w - 8-bit dynamic activation, 4-bit weight\n" + " 8da8w - 8-bit dynamic activation, 8-bit weight\n" + " 8da4w,8da8w - 8-bit dynamic activation, 4-bit weight and 8-bit weight\n" " 4w - 4-bit weight only\n" " 8w - 8-bit weight only" ), @@ -148,15 +152,15 @@ def run(self): if self.args.qlinear: kwargs["qlinear"] = self.args.qlinear if self.args.qlinear_group_size: - kwargs["qlinear_group_size"] = self.args.qlinear + kwargs["qlinear_group_size"] = self.args.qlinear_group_size if self.args.qlinear_encoder: - kwargs["qlinear_encoder"] = self.args.qlinear + kwargs["qlinear_encoder"] = self.args.qlinear_encoder if self.args.qlinear_encoder_group_size: - kwargs["qlinear_encoder_group_size"] = self.args.qlinear + kwargs["qlinear_encoder_group_size"] = self.args.qlinear_encoder_group_size if self.args.qembedding: kwargs["qembedding"] = self.args.qembedding if self.args.qembedding_group_size: - kwargs["qembedding_group_size"] = self.args.qembedding + kwargs["qembedding_group_size"] = self.args.qembedding_group_size if self.args.max_seq_len: kwargs["max_seq_len"] = self.args.max_seq_len diff --git a/optimum/exporters/executorch/quantization.py b/optimum/exporters/executorch/quantization.py index 2dd9ee1..c846d40 100644 --- a/optimum/exporters/executorch/quantization.py +++ b/optimum/exporters/executorch/quantization.py @@ -67,73 +67,92 @@ def quantize_model_( ) if qlinear_config: + def build_linear_config(config_key: str, granularity): + if config_key == "8da4w": + return Int8DynamicActivationIntxWeightConfig( + weight_dtype=torch.int4, + weight_granularity=granularity, + ) + if config_key == "4w": + return IntxWeightOnlyConfig( + weight_dtype=torch.int4, + granularity=granularity, + ) + if config_key == "8w": + return IntxWeightOnlyConfig( + weight_dtype=torch.int8, + granularity=granularity, + ) + if config_key == "8da8w": + return Int8DynamicActivationIntxWeightConfig( + 
weight_dtype=torch.int8, + weight_granularity=PerAxis(0), + ) + raise ValueError(f"Unsupported linear quantization config '{config_key}'.") + + qlinear_configs = [cfg.strip() for cfg in qlinear_config.split(",")] + if any(cfg == "" for cfg in qlinear_configs): + raise ValueError("Linear quantization config entries must be non-empty.") + if len(qlinear_configs) > 2: + raise ValueError( + "Expected at most one fallback linear quantization config, got more than one comma." + ) + + primary_linear_config_key = qlinear_configs[0] + fallback_linear_config_key = qlinear_configs[1] if len(qlinear_configs) == 2 else None + if qlinear_group_size == 0: linear_weight_granularity = PerAxis(0) + if fallback_linear_config_key is not None: + logging.warning( + "qlinear_group_size is 0, fallback linear config will not be used as all layers will be quantized with per-axis granularity." + ) + fallback_linear_config_key = None else: - assert qlinear_group_size % 2 == 0, "Linear quantization group size must be a multiple of 2." + assert qlinear_group_size % 2 == 0, f"Linear quantization group size must be a multiple of 2, got {qlinear_group_size}." linear_weight_granularity = PerGroup(qlinear_group_size) logging.info("Quantizing linear layers.") - linear_config = { - "8da4w": Int8DynamicActivationIntxWeightConfig( - weight_dtype=torch.int4, - weight_granularity=linear_weight_granularity, - ), - "4w": IntxWeightOnlyConfig( - weight_dtype=torch.int4, - granularity=linear_weight_granularity, - ), - "8w": IntxWeightOnlyConfig( - weight_dtype=torch.int8, - granularity=linear_weight_granularity, - ), - }[qlinear_config] + primary_linear_config = build_linear_config( + primary_linear_config_key, linear_weight_granularity + ) - if qlinear_group_size > 0: - # First, quantize layers that are compatible with group quantization - def group_quantizable_filter(module, fqn): - if isinstance(module, torch.nn.Linear): - # Check if hidden dimension is divisible by group size - # For Linear layers, weight shape is [out_features, in_features] - # Group quantization typically applies to the in_features dimension (dim=1) - return module.weight.shape[1] % qlinear_group_size == 0 - return False + # First, quantize layers that are compatible with group quantization + def quant_filter(module, fqn): + if isinstance(module, torch.nn.Linear): + # Check if hidden dimension is divisible by group size + # For Linear layers, weight shape is [out_features, in_features] + # Group quantization typically applies to the in_features dimension (dim=1) + return qlinear_group_size == 0 or (module.weight.shape[1] % qlinear_group_size == 0) + return False - quantize_( - eager_model, - linear_config, - filter_fn=group_quantizable_filter, - ) + quantize_( + eager_model, + primary_linear_config, + filter_fn=quant_filter, + ) - # Then, quantize incompatible layers with 8w per-channel quantization - per_channel_config = IntxWeightOnlyConfig( - weight_dtype=torch.int8, - granularity=PerAxis(0), + # Then, quantize incompatible layers using the fallback per-axis config + if fallback_linear_config_key is not None: + fallback_linear_config = build_linear_config( + fallback_linear_config_key, PerAxis(0) ) - + def per_channel_filter(module, fqn): if isinstance(module, torch.nn.Linear): # Only quantize layers that are NOT compatible with group quantization # and haven't been quantized yet - if hasattr(module.weight, "tensor_impl"): - # Already quantized, skip - return False - return module.weight.shape[1] % qlinear_group_size != 0 + return not quant_filter(module, 
fqn) return False logging.info( - f"Applying per-channel quantization to linear layers incompatible with group size {qlinear_group_size}." + f"Applying fallback linear config '{fallback_linear_config_key}' (per-axis)" + f" to layers incompatible with group size {qlinear_group_size}." ) quantize_( eager_model, - per_channel_config, + fallback_linear_config, filter_fn=per_channel_filter, ) - else: - # qlinear_group_size == 0, use per-channel for all - quantize_( - eager_model, - linear_config, - ) unwrap_tensor_subclass(eager_model) diff --git a/optimum/exporters/executorch/tasks/multimodal_text_to_text.py b/optimum/exporters/executorch/tasks/multimodal_text_to_text.py index 4449570..8f89df7 100644 --- a/optimum/exporters/executorch/tasks/multimodal_text_to_text.py +++ b/optimum/exporters/executorch/tasks/multimodal_text_to_text.py @@ -218,19 +218,23 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs): quantize_encoder_kwargs["qlinear_group_size"] = qlinear_encoder_group_size quantize_model_(**quantize_encoder_kwargs) - # TODO: quantize other parts of the model, e.g. MultimodalProjector? - # Quantize decoder embeddings. quantize_decoder_embedding_kwargs = { - "eager_model": getattr(eager_model, decoder_name), + "eager_model": eager_model, "qembedding_config": qembedding_config, } if qembedding_group_size is not None: quantize_decoder_embedding_kwargs["qembedding_group_size"] = qembedding_group_size quantize_model_(**quantize_decoder_embedding_kwargs) - # TODO: quantize encoder embeddings. - + # Quantize lm_head + if hasattr(eager_model, "lm_head") and qlinear_config is not None: + quantize_model_( + eager_model=eager_model.lm_head, + qlinear_config=qlinear_config, + qlinear_group_size=qlinear_group_size if qlinear_group_size is not None else 0, + ) + print(eager_model) return MultiModalTextToTextExportableModule( model=eager_model, modality="audio" if audio_encoder_name else "vision", From a872c53c2b32afddbde917c454ebd32e09d962ad Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Tue, 7 Oct 2025 09:35:16 -0700 Subject: [PATCH 3/3] Works --- optimum/commands/export/executorch.py | 2 +- optimum/exporters/executorch/quantization.py | 33 ++++++++----------- .../tasks/multimodal_text_to_text.py | 20 ++++++----- tests/models/test_modeling_gemma3.py | 6 ++-- 4 files changed, 30 insertions(+), 31 deletions(-) diff --git a/optimum/commands/export/executorch.py b/optimum/commands/export/executorch.py index c25c1d5..80d6a4c 100644 --- a/optimum/commands/export/executorch.py +++ b/optimum/commands/export/executorch.py @@ -101,7 +101,7 @@ def parse_args_executorch(parser): "Options:\n" " 8da4w - 8-bit dynamic activation, 4-bit weight\n" " 8da8w - 8-bit dynamic activation, 8-bit weight\n" - " 8da4w,8da8w - 8-bit dynamic activation, 4-bit weight and 8-bit weight\n" + " 8da4w,8da8w - 8-bit dynamic activation, 4-bit weight; fallback on 8-bit dynamic activation, 8-bit weight per-channel where group size doesn't divide block size cleanly \n" " 4w - 4-bit weight only\n" " 8w - 8-bit weight only" ), diff --git a/optimum/exporters/executorch/quantization.py b/optimum/exporters/executorch/quantization.py index c846d40..ded5178 100644 --- a/optimum/exporters/executorch/quantization.py +++ b/optimum/exporters/executorch/quantization.py @@ -40,7 +40,7 @@ def quantize_model_( if qlinear_config == "8w": assert ( qembedding_group_size == 0 - ), "8-bit embedding quantization only supports per-channel at the moment, please use qembedding_group_size = 0." 
+ ), "8-bit embedding quantization only supports per-token at the moment, please use qembedding_group_size = 0." if qembedding_group_size == 0: embedding_weight_granularity = PerAxis(0) else: @@ -67,6 +67,7 @@ def quantize_model_( ) if qlinear_config: + def build_linear_config(config_key: str, granularity): if config_key == "8da4w": return Int8DynamicActivationIntxWeightConfig( @@ -94,9 +95,7 @@ def build_linear_config(config_key: str, granularity): if any(cfg == "" for cfg in qlinear_configs): raise ValueError("Linear quantization config entries must be non-empty.") if len(qlinear_configs) > 2: - raise ValueError( - "Expected at most one fallback linear quantization config, got more than one comma." - ) + raise ValueError("Expected at most one fallback linear quantization config, got more than one comma.") primary_linear_config_key = qlinear_configs[0] fallback_linear_config_key = qlinear_configs[1] if len(qlinear_configs) == 2 else None @@ -109,16 +108,16 @@ def build_linear_config(config_key: str, granularity): ) fallback_linear_config_key = None else: - assert qlinear_group_size % 2 == 0, f"Linear quantization group size must be a multiple of 2, got {qlinear_group_size}." + assert ( + qlinear_group_size % 2 == 0 + ), f"Linear quantization group size must be a multiple of 2, got {qlinear_group_size}." linear_weight_granularity = PerGroup(qlinear_group_size) logging.info("Quantizing linear layers.") - primary_linear_config = build_linear_config( - primary_linear_config_key, linear_weight_granularity - ) + primary_linear_config = build_linear_config(primary_linear_config_key, linear_weight_granularity) # First, quantize layers that are compatible with group quantization - def quant_filter(module, fqn): + def per_group_filter(module, fqn): if isinstance(module, torch.nn.Linear): # Check if hidden dimension is divisible by group size # For Linear layers, weight shape is [out_features, in_features] @@ -129,20 +128,16 @@ def quant_filter(module, fqn): quantize_( eager_model, primary_linear_config, - filter_fn=quant_filter, + filter_fn=per_group_filter, ) # Then, quantize incompatible layers using the fallback per-axis config if fallback_linear_config_key is not None: - fallback_linear_config = build_linear_config( - fallback_linear_config_key, PerAxis(0) - ) - - def per_channel_filter(module, fqn): + fallback_linear_config = build_linear_config(fallback_linear_config_key, PerAxis(0)) + + def per_token_filter(module, fqn): if isinstance(module, torch.nn.Linear): - # Only quantize layers that are NOT compatible with group quantization - # and haven't been quantized yet - return not quant_filter(module, fqn) + return module.weight.shape[1] % qlinear_group_size != 0 return False logging.info( @@ -152,7 +147,7 @@ def per_channel_filter(module, fqn): quantize_( eager_model, fallback_linear_config, - filter_fn=per_channel_filter, + filter_fn=per_token_filter, ) unwrap_tensor_subclass(eager_model) diff --git a/optimum/exporters/executorch/tasks/multimodal_text_to_text.py b/optimum/exporters/executorch/tasks/multimodal_text_to_text.py index 8f89df7..d3bf007 100644 --- a/optimum/exporters/executorch/tasks/multimodal_text_to_text.py +++ b/optimum/exporters/executorch/tasks/multimodal_text_to_text.py @@ -14,6 +14,7 @@ import json +import logging import os.path import torchao @@ -201,15 +202,24 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs): qembedding_group_size = kwargs.get("qembedding_group_size", None) # Quantize decoder linear weights. 
+ if qlinear_config: + logging.info("Quantizing decoder linears...") quantize_decoder_kwargs = { "eager_model": getattr(eager_model, decoder_name), "qlinear_config": qlinear_config, } + quantize_lm_head_kwargs = { + "eager_model": eager_model.lm_head, + "qlinear_config": qlinear_config, + } if qlinear_group_size is not None: quantize_decoder_kwargs["qlinear_group_size"] = qlinear_group_size quantize_model_(**quantize_decoder_kwargs) + quantize_model_(**quantize_lm_head_kwargs) # Quantize encoder linear weights. + if qlinear_encoder_config: + logging.info("Quantizing encoder linears...") quantize_encoder_kwargs = { "eager_model": getattr(eager_model, encoder_name), "qlinear_config": qlinear_encoder_config, @@ -219,6 +229,8 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs): quantize_model_(**quantize_encoder_kwargs) # Quantize decoder embeddings. + if qembedding_config: + logging.info("Quantizing decoder embeddings...") quantize_decoder_embedding_kwargs = { "eager_model": eager_model, "qembedding_config": qembedding_config, @@ -227,14 +239,6 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs): quantize_decoder_embedding_kwargs["qembedding_group_size"] = qembedding_group_size quantize_model_(**quantize_decoder_embedding_kwargs) - # Quantize lm_head - if hasattr(eager_model, "lm_head") and qlinear_config is not None: - quantize_model_( - eager_model=eager_model.lm_head, - qlinear_config=qlinear_config, - qlinear_group_size=qlinear_group_size if qlinear_group_size is not None else 0, - ) - print(eager_model) return MultiModalTextToTextExportableModule( model=eager_model, modality="audio" if audio_encoder_name else "vision", diff --git a/tests/models/test_modeling_gemma3.py b/tests/models/test_modeling_gemma3.py index ff507bb..eeb6834 100644 --- a/tests/models/test_modeling_gemma3.py +++ b/tests/models/test_modeling_gemma3.py @@ -309,9 +309,9 @@ def test_gemma3_image_vision_with_custom_sdpa_kv_cache_8da4w_8we(self): use_custom_kv_cache=True, qlinear="8da4w", qlinear_group_size=32, - # Can't quantize the encoder a the moment, hidden dim of 4304 doesn't fit ExecuTorch's - # XNNPack 32-group size quantized kernels. See https://github.com/pytorch/executorch/issues/14221. - qembedding_config="8w", + qlinear_encoder="8da4w,8da8w", + qlinear_encoder_group_size=32, + qembedding="8w", ) # Generate
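
Note for reviewers, outside the patch series itself: below is a minimal, standalone sketch of the fallback scheme these three patches implement, included for reference only. Linear layers whose in_features is divisible by the group size receive the primary per-group config (e.g. 8da4w); the remaining layers fall back to a per-axis config (e.g. 8da8w). The torchao import paths, the toy layer sizes, and the module/filter names are assumptions for illustration and may differ across torchao versions; the 4304 dimension mirrors the Gemma3 encoder hidden size referenced in the removed test comment.

import torch

# Import paths are assumptions; recent torchao versions expose these under
# torchao.quantization / torchao.quantization.quant_api, but check your install.
from torchao.quantization import quantize_
from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.quant_api import Int8DynamicActivationIntxWeightConfig

GROUP_SIZE = 32

# Toy model: the first projection is group-size compatible, the second is not
# (4304 % 32 != 0, the Gemma3 encoder case mentioned in the removed test comment).
model = torch.nn.Sequential(
    torch.nn.Linear(4096, 4096),
    torch.nn.Linear(4304, 4096),
)


def per_group_filter(module, fqn):
    # Linear weight shape is [out_features, in_features]; group quantization
    # applies along in_features (dim=1), so it must divide evenly by the group size.
    return isinstance(module, torch.nn.Linear) and module.weight.shape[1] % GROUP_SIZE == 0


def fallback_filter(module, fqn):
    # Everything the per-group pass skipped gets the per-axis fallback.
    return isinstance(module, torch.nn.Linear) and not per_group_filter(module, fqn)


# Primary config: 8-bit dynamic activation, 4-bit weight, per-group granularity.
quantize_(
    model,
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int4,
        weight_granularity=PerGroup(GROUP_SIZE),
    ),
    filter_fn=per_group_filter,
)

# Fallback config: 8-bit dynamic activation, 8-bit weight, per-axis (one scale per output channel).
quantize_(
    model,
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int8,
        weight_granularity=PerAxis(0),
    ),
    filter_fn=fallback_filter,
)

With the series applied, the same behavior is selected at export time through the new "8da4w,8da8w" choice, e.g. --qlinear 8da4w,8da8w --qlinear_group_size 32 (or --qlinear_encoder / --qlinear_encoder_group_size for the encoder); the updated Gemma3 test exercises the encoder fallback path with qlinear_encoder="8da4w,8da8w" and qlinear_encoder_group_size=32.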