Commit dfff847

Commit message: up
1 parent 73c08f5 commit dfff847

File tree: 5 files changed, +10 / -61 lines
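At a glance, this commit enables `do_quant_fusion_and_const_prop` in the CoreML, LLaVA, and Qualcomm export flows, drops the unused dtype override from `get_quant_embedding_transform`, renames the internal `_quantize_embedding` helper to `_embedding_quantizer`, and strips debugging scaffolding (prints and a duplicate quantization path) from the LLaVA export.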


examples/apple/coreml/llama/export.py

Lines changed: 1 addition & 0 deletions
@@ -222,6 +222,7 @@ def main() -> None:
             ],
             memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
             sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
+            do_quant_fusion_and_const_prop=True,
         )
     )
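For context, `do_quant_fusion_and_const_prop` is a field on ExecuTorch's `ExecutorchBackendConfig`, which the hunk above is constructing. A minimal sketch of how the flag is passed at `to_executorch` time; the `edge_manager` variable is a stand-in for an already-lowered edge program, and only the config field itself comes from this diff:

# Minimal sketch, assuming an already-lowered EdgeProgramManager in
# `edge_manager`; names other than the config field are illustrative.
from executorch.exir import ExecutorchBackendConfig

executorch_program = edge_manager.to_executorch(
    ExecutorchBackendConfig(
        # Fuse quantize/dequantize pairs and constant-propagate the
        # resulting quantized weights before emitting the program.
        do_quant_fusion_and_const_prop=True,
    )
)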

examples/models/llama/export_llama_lib.py

Lines changed: 2 additions & 1 deletion
@@ -1342,7 +1342,8 @@ def _get_source_transforms(  # noqa
     """
     transforms.append(
         get_quant_embedding_transform(
-            embedding_quantize, use_shared_embedding, checkpoint_dtype
+            embedding_quantize,
+            use_shared_embedding,
         )
     )
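With the dtype override gone, callers pass only the quantization spec and the shared-embedding flag. A minimal sketch of the new call shape; the "8,32" spec (8-bit weights, group size 32) and the `model` variable are illustrative stand-ins:

# Sketch of the two-argument signature after this change; `model` is a
# stand-in for the eager llama model being transformed.
transform = get_quant_embedding_transform(
    embedding_quantize="8,32",
    use_shared_embedding=False,
)
model = transform(model)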

examples/models/llama/source_transformation/quantize.py

Lines changed: 3 additions & 4 deletions
@@ -510,6 +510,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
             self.precision,
         )

+
 #########################################################################
 ##### embedding table quantization ######

@@ -734,7 +735,6 @@ def forward(self, indices: torch.Tensor) -> torch.Tensor:
 def get_quant_embedding_transform(
     embedding_quantize: str,
     use_shared_embedding: bool = False,
-    dtype_override: Optional[DType] = None,
 ):
     use_torchao = embedding_quantize.startswith("torchao:")
     if use_torchao:

@@ -783,13 +783,12 @@ def _torchao_embedding_quantizer(model):

         return _torchao_embedding_quantizer

-    def _quantize_embedding(model):
+    def _embedding_quantizer(model):
         assert weight_dtype in [
             torch.int2,
             torch.int4,
             torch.int8,
         ], "Only 2, 4, or 8-bit embeddings are supported unless using torchao"
-        print("GRAN", granularity)
         quantize_(
             model,
             IntxWeightOnlyConfig(

@@ -801,7 +800,7 @@ def _quantize_embedding(model):
             )
         return model

-    return _quantize_embedding
+    return _embedding_quantizer


 def get_quant_weight_transform(
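The renamed `_embedding_quantizer` closure is built on torchao's `quantize_` API with `IntxWeightOnlyConfig`, as the hunk above shows. A rough standalone sketch of that pattern; the embedding sizes, group size, and filter function are assumptions, not lifted from this file:

# Sketch: group-wise int8 weight-only quantization of an embedding table
# via torchao, in the style of _embedding_quantizer. Sizes and the filter
# function are illustrative assumptions.
import torch
from torchao.quantization.granularity import PerGroup
from torchao.quantization.quant_api import IntxWeightOnlyConfig, quantize_

model = torch.nn.Embedding(num_embeddings=32000, embedding_dim=2048)
quantize_(
    model,
    IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerGroup(32)),
    # Restrict the transform to embedding modules, not linear layers.
    lambda m, fqn: isinstance(m, torch.nn.Embedding),
)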

examples/models/llava/export_llava.py

Lines changed: 3 additions & 56 deletions
@@ -26,7 +26,6 @@
 from executorch.examples.models.llama.source_transformation.quantize import (
     get_quant_embedding_transform,
     get_quant_weight_transform,
-    EmbeddingQuantHandler,
 )
 from executorch.examples.models.llama.source_transformation.sdpa import (
     replace_sdpa_with_custom_op,

@@ -178,51 +177,9 @@ def forward(self, images):


 def export_token_embedding(llava, prompt):
-    import copy
-    model_copy = copy.deepcopy(llava.model_.language_model.model)
-    quantized_token_embed_copy = get_quant_embedding_transform("8,32")(
-        model_copy,
+    quantized_token_embed = get_quant_embedding_transform("8,32")(
+        llava.model_.language_model.model,
     )
-    def quant_embedding(model):
-        return EmbeddingQuantHandler(
-            model,
-            bitwidth=8,
-            group_size=32,
-            packed=False,
-        ).quantized_model()
-
-    quantized_token_embed = quant_embedding(llava.model_.language_model.model)
-
-    print("GET ATTRS", quantized_token_embed)
-    print("GET ATTRS2", quantized_token_embed.embed_tokens)
-
-    qval = quantized_token_embed.embed_tokens.weight
-    scale = quantized_token_embed.embed_tokens.scales
-
-    qval_copy = quantized_token_embed_copy.embed_tokens.weight.tensor_impl.get_plain()[0]
-    scale_copy = quantized_token_embed_copy.embed_tokens.weight.tensor_impl.get_plain()[1]
-    zero_copy = quantized_token_embed_copy.embed_tokens.weight.tensor_impl.get_plain()[2]
-
-    print("COPY TENSOR", quantized_token_embed_copy.embed_tokens.weight)
-    print("ORIGINAL DTYPE", quantized_token_embed.embed_tokens.dtype)
-
-    print("COMPARING")
-    print("qval_copy", qval_copy)
-    print("qval", qval)
-    print("MATCHING", (qval_copy == qval).to(torch.float32).mean())
-    print("MAX DIFF", (qval_copy.to(torch.int32) - qval.to(torch.int32)).abs().max())
-
-    print("scale_copy", scale_copy)
-    print("scale", scale)
-    print("ISCLOSE", torch.isclose(scale_copy, scale).to(torch.float32).mean())
-
-    print("zero_copy", zero_copy)
-    print("ALL ZEROS", (zero_copy == 0).to(torch.float32).mean())
-
-
-
-
-
     token_dim_1 = Dim("token_dim_1", min=2, max=llava.text_model_args.max_seq_len)
     dynamic_shapes = [{1: token_dim_1}]
     with torch.no_grad():

@@ -232,16 +189,7 @@ def quant_embedding(model):
             dynamic_shapes=dynamic_shapes,
             strict=True,
         )
-        token_embedding_ep_copy = torch.export.export(
-            quantized_token_embed_copy.embed_tokens,
-            (prompt,),
-            dynamic_shapes=dynamic_shapes,
-            strict=True,
-        )
-
-        print("token_embedding_ep_copy", token_embedding_ep_copy)
-        print("token_embedding_ep", token_embedding_ep)
-        return token_embedding_ep_copy
+        return token_embedding_ep


 def export_all(llava_model: LlavaModel):

@@ -302,7 +250,6 @@ def export_all(llava_model: LlavaModel):
             do_quant_fusion_and_const_prop=True,
         )
     )
-    logging.info("TOKEN EMBEDDING PROG", str(executorch_program.exported_program("token_embedding")))
     for execution_plan in executorch_program._emitter_output.program.execution_plan:
         logging.info(
             f"Required memory for activation in bytes: {execution_plan.non_const_buffer_sizes}"

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 1 addition & 0 deletions
@@ -440,6 +440,7 @@ def lowering_modules(
                 alloc_graph_output=False,
             ),
             extract_delegate_segments=True,
+            do_quant_fusion_and_const_prop=True,
         )
         with torch.no_grad():
             # backend option
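This is the same `ExecutorchBackendConfig` flag enabled for the CoreML export above, so the Qualcomm flow now also fuses quantized ops and constant-propagates their weights when the program is emitted.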
