|
24 | 24 | replace_kv_cache_with_custom_kv_cache, |
25 | 25 | ) |
26 | 26 | from executorch.examples.models.llama.source_transformation.quantize import ( |
27 | | - EmbeddingQuantHandler, |
| 27 | + get_quant_embedding_transform, |
28 | 28 | get_quant_weight_transform, |
29 | 29 | ) |
30 | 30 | from executorch.examples.models.llama.source_transformation.sdpa import ( |
|
38 | 38 | ) |
39 | 39 |
|
40 | 40 | from executorch.exir.passes import MemoryPlanningPass |
41 | | -from executorch.exir.passes.quant_fusion_pass import QuantFusionPass |
42 | 41 | from executorch.exir.passes.sym_shape_eval_pass import ( |
43 | 42 | ConstraintBasedSymShapeEvalPass, |
44 | 43 | HintBasedSymShapeEvalPass, |
@@ -184,15 +183,9 @@ def forward(self, images): |
184 | 183 |
|
185 | 184 |
|
186 | 185 | def export_token_embedding(llava, prompt): |
187 | | - def quant_embedding(model): |
188 | | - return EmbeddingQuantHandler( |
189 | | - model, |
190 | | - bitwidth=8, |
191 | | - group_size=32, |
192 | | - packed=False, |
193 | | - ).quantized_model() |
194 | | - |
195 | | - quantized_token_embed = quant_embedding(llava.model_.language_model.model) |
| 186 | + quantized_token_embed = get_quant_embedding_transform( |
| 187 | + llava.model_.language_model.model |
| 188 | + ) |
196 | 189 | token_dim_1 = Dim("token_dim_1", min=2, max=llava.text_model_args.max_seq_len) |
197 | 190 | dynamic_shapes = [{1: token_dim_1}] |
198 | 191 | with torch.no_grad(): |
@@ -254,15 +247,13 @@ def export_all(llava_model: LlavaModel): |
254 | 247 | executorch_program = lowered_and_edge.to_executorch( |
255 | 248 | ExecutorchBackendConfig( |
256 | 249 | extract_delegate_segments=True, |
257 | | - passes=[ |
258 | | - QuantFusionPass(), |
259 | | - ], |
260 | 250 | memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), |
261 | 251 | sym_shape_eval_pass={ |
262 | 252 | "image_encoder": ConstraintBasedSymShapeEvalPass(), |
263 | 253 | "text_model": ConstraintBasedSymShapeEvalPass(), |
264 | 254 | "token_embedding": HintBasedSymShapeEvalPass(), |
265 | 255 | }, |
| 256 | + do_quant_fusion_and_const_prop=True, |
266 | 257 | ) |
267 | 258 | ) |
268 | 259 | for execution_plan in executorch_program._emitter_output.program.execution_plan: |
|
0 commit comments