Skip to content

Commit 19bd7a6

Browse files
committed
up
1 parent 0b9b46d commit 19bd7a6

File tree

3 files changed

+7
-15
lines changed

3 files changed

+7
-15
lines changed

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -233,7 +233,7 @@ if [[ "${CUSTOM}" == "ON" ]]; then
233233
EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
234234
fi
235235
if [[ "${QE}" == "ON" ]]; then
236-
EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024"
236+
EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,0"
237237
fi
238238
if [[ "${MPS}" == "ON" ]]; then
239239
EXPORT_ARGS="${EXPORT_ARGS} -kv -v --mps --disable_dynamic_shape"

examples/models/llama/source_transformation/quantize.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -572,6 +572,7 @@ def _quantize_embedding(model):
572572
torch.int4,
573573
torch.int8,
574574
], "Only 2, 4, or 8-bit embeddings are supported unless using torchao"
575+
print("GRAN", granularity)
575576
quantize_(
576577
model,
577578
IntxWeightOnlyConfig(

examples/models/llava/export_llava.py

Lines changed: 5 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424
replace_kv_cache_with_custom_kv_cache,
2525
)
2626
from executorch.examples.models.llama.source_transformation.quantize import (
27-
EmbeddingQuantHandler,
27+
get_quant_embedding_transform,
2828
get_quant_weight_transform,
2929
)
3030
from executorch.examples.models.llama.source_transformation.sdpa import (
@@ -38,7 +38,6 @@
3838
)
3939

4040
from executorch.exir.passes import MemoryPlanningPass
41-
from executorch.exir.passes.quant_fusion_pass import QuantFusionPass
4241
from executorch.exir.passes.sym_shape_eval_pass import (
4342
ConstraintBasedSymShapeEvalPass,
4443
HintBasedSymShapeEvalPass,
@@ -184,15 +183,9 @@ def forward(self, images):
184183

185184

186185
def export_token_embedding(llava, prompt):
187-
def quant_embedding(model):
188-
return EmbeddingQuantHandler(
189-
model,
190-
bitwidth=8,
191-
group_size=32,
192-
packed=False,
193-
).quantized_model()
194-
195-
quantized_token_embed = quant_embedding(llava.model_.language_model.model)
186+
quantized_token_embed = get_quant_embedding_transform(
187+
llava.model_.language_model.model
188+
)
196189
token_dim_1 = Dim("token_dim_1", min=2, max=llava.text_model_args.max_seq_len)
197190
dynamic_shapes = [{1: token_dim_1}]
198191
with torch.no_grad():
@@ -254,15 +247,13 @@ def export_all(llava_model: LlavaModel):
254247
executorch_program = lowered_and_edge.to_executorch(
255248
ExecutorchBackendConfig(
256249
extract_delegate_segments=True,
257-
passes=[
258-
QuantFusionPass(),
259-
],
260250
memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
261251
sym_shape_eval_pass={
262252
"image_encoder": ConstraintBasedSymShapeEvalPass(),
263253
"text_model": ConstraintBasedSymShapeEvalPass(),
264254
"token_embedding": HintBasedSymShapeEvalPass(),
265255
},
256+
do_quant_fusion_and_const_prop=True,
266257
)
267258
)
268259
for execution_plan in executorch_program._emitter_output.program.execution_plan:

0 commit comments

Comments (0)