Commit 614dd42

generate quantized lora
1 parent f7515a9 commit 614dd42

File tree: 7 files changed (+73, -46 lines)

.ci/scripts/test_llama_lora.sh

Lines changed: 3 additions & 2 deletions

@@ -107,8 +107,9 @@ $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
     model.dtype_override="fp32" \
     backend.xnnpack.enabled=true \
     backend.xnnpack.extended_ops=true \
-    export.output_name="${MODEL_SEPARATE}.pte" \
-    export.foundation_weights_file="${MODEL_SEPARATE}.ptd"
+    quantization.pt2e_quantize="xnnpack_dynamic" \
+    export.output_name="${MODEL}.pte" \
+    export.foundation_weights_file="${MODEL}.ptd"
 
 # Run llama runner.
 NOW=$(date +"%H:%M:%S")

backends/xnnpack/_passes/propagate_custom_meta_pass.py

Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@ def call(self, graph_module: torch.fx.GraphModule):
                 continue
 
             if "custom" in parent_node.meta:
+                print(f"PROPAGATING CUSTOM META FROM {parent_node.name} TO {node.name}")
                 node.meta["custom"] = parent_node.meta["custom"]
 
         graph_module.recompile()
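For context, the pass touched above copies a node's "custom" metadata entry onto its users so that delegate tagging survives graph transformations. The snippet below is a minimal, self-contained sketch of that idea over a plain torch.fx graph; the helper name propagate_custom_meta and the tag value are illustrative, not the ExecuTorch pass itself.

import torch
import torch.fx


class TwoLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.a = torch.nn.Linear(4, 4)
        self.b = torch.nn.Linear(4, 4)

    def forward(self, x):
        return self.b(self.a(x))


def propagate_custom_meta(gm: torch.fx.GraphModule) -> None:
    # Copy meta["custom"] from each node's first input onto the node itself,
    # mirroring the assignment shown in the diff above.
    for node in gm.graph.nodes:
        if not node.all_input_nodes:
            continue
        parent_node = node.all_input_nodes[0]
        if "custom" in parent_node.meta:
            node.meta["custom"] = parent_node.meta["custom"]
    gm.recompile()


gm = torch.fx.symbolic_trace(TwoLinear())
# Tag the input placeholder; the pass then spreads the tag through the graph.
next(iter(gm.graph.nodes)).meta["custom"] = {"external_tag": "model"}
propagate_custom_meta(gm)
print([(n.name, n.meta.get("custom")) for n in gm.graph.nodes])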

backends/xnnpack/operators/node_visitor.py

Lines changed: 2 additions & 0 deletions

@@ -296,6 +296,7 @@ def get_quant_params(
                     offset=UINT64_MAX, size=num_bytes, named_key=scale_name
                 )
             )
+            print(f"NDM: adding scale tensor with key {scale_name}")
             self._named_data_store.add_named_data(
                 scale_name, bytes(scale_array), CONSTANT_TENSOR_ALIGNMENT
             )

@@ -630,6 +631,7 @@ def get_serialized_buffer_index(
         logging.info(
             f"Adding constant data with name {tensor.name}, key {named_key} and external_tag {external_tag} to named_data_store"
         )
+        print(f"NDM: Adding constant data with name {tensor.name}, key {named_key} and tag {external_tag}")
         self._named_data_store.add_named_data(
             named_key,
             bytes(array),
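Both hunks sit next to calls of the form self._named_data_store.add_named_data(key, data, alignment): each constant (quantization scales in the first hunk, serialized weight buffers in the second) is registered under a string key together with its raw bytes and a required alignment, so the delegate payload can reference the key while the bytes land in an external .ptd segment. Below is a rough sketch of that contract using a toy class, which is an assumption for illustration and not ExecuTorch's NamedDataStore API.

from dataclasses import dataclass
from typing import Dict


@dataclass
class NamedBlob:
    data: bytes
    alignment: int


class ToyNamedDataStore:
    def __init__(self) -> None:
        self._blobs: Dict[str, NamedBlob] = {}

    def add_named_data(self, key: str, data: bytes, alignment: int) -> None:
        # Registering the same key twice with different bytes would be a bug,
        # so fail loudly instead of silently overwriting.
        existing = self._blobs.get(key)
        if existing is not None and existing.data != data:
            raise ValueError(f"conflicting data for key {key!r}")
        self._blobs[key] = NamedBlob(data, alignment)


store = ToyNamedDataStore()
store.add_named_data("linear_weight_scales", b"\x00" * 16, 16)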

backends/xnnpack/test/passes/test_propagate_custom_meta_pass.py

Lines changed: 16 additions & 26 deletions

@@ -8,32 +8,30 @@
 
 from typing import Callable, Optional, Tuple, Union
 
-import executorch.backends.test.harness.stages as BaseStages
-
 import torch
 from executorch.backends.test.harness.stages import StageType
+from executorch.backends.xnnpack.test.tester import RunPasses, Tester
+from executorch.backends.xnnpack.test.tester import Quantize as XNNPackQuantize
+import executorch.backends.test.harness.stages as BaseStages
+from executorch.backends.xnnpack.test.tester.tester import (
+    ToEdgeTransformAndLower,
+)
 from executorch.backends.xnnpack.partition.config.xnnpack_config import (
     ConfigPrecisionType,
 )
 from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
     XnnpackFloatingPointPartitioner,
     XnnpackPartitioner,
 )
-
-from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
-    get_symmetric_quantization_config,
-)
-from executorch.backends.xnnpack.test.tester import (
-    Quantize as XNNPackQuantize,
-    RunPasses,
-    Tester,
-)
-from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower
 from executorch.exir import (
     EdgeCompileConfig,
     ExecutorchBackendConfig,
     to_edge_transform_and_lower,
 )
+
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+)
 from executorch.exir.passes.external_constants_pass import (
     delegate_external_constants_pass_unlifted,
 )

@@ -56,7 +54,6 @@
     has_quantized_ops = False
     print("Missing quantized ops")
 
-
 class TestPropagateCustomMetaPass(unittest.TestCase):
     class ModuleLinear(torch.nn.Module):
         def __init__(

@@ -99,14 +96,12 @@ def run(
                 module=tagged_module,
                 gen_tag_fn=lambda x: "model",  # This is the filename the weights will be saved to. In this case, weights will be saved as "model.ptd"
             )
-            self.exported_program = export(
-                tagged_module, inputs, dynamic_shapes=self.dynamic_shapes, strict=True
-            )
+            self.exported_program = export(tagged_module, inputs, dynamic_shapes=self.dynamic_shapes, strict=True)
 
     def _test_linear(
         self,
         partitioner: XnnpackPartitioner,
-        quantization_stage: Union[BaseStages.Quantize, BaseStages.Quantize_],
+        quantization_stage: Union[BaseStages.Quantize, BaseStages.Quantize_]
     ):
         eager_model = self.ModuleLinear(
             in_size=1,

@@ -143,8 +138,8 @@ def test_quantize_(self):
             weight_granularity=PerGroup(32),
         )
         self._test_linear(
-            DynamicallyQuantizedPartitioner, BaseStages.Quantize_(config=linear_config)
-        )
+            DynamicallyQuantizedPartitioner,
+            BaseStages.Quantize_(config=linear_config))
 
     def test_pt2e_quantize(self):
         # Quantize with pt2e quantize.

@@ -157,10 +152,7 @@ def test_pt2e_quantize(self):
             get_symmetric_quantization_config(is_per_channel=True, is_dynamic=True),
         ]
         partitioners = []
-        for config_precision in [
-            ConfigPrecisionType.STATIC_QUANT,
-            ConfigPrecisionType.DYNAMIC_QUANT,
-        ]:
+        for config_precision in [ConfigPrecisionType.STATIC_QUANT, ConfigPrecisionType.DYNAMIC_QUANT]:
             for per_op_mode in [True, False]:
                 partitioners.append(
                     XnnpackPartitioner(

@@ -170,6 +162,4 @@ def test_pt2e_quantize(self):
                 )
         for quant_config in quant_configs:
             for partitioner in partitioners:
-                self._test_linear(
-                    partitioner, XNNPackQuantize(quantization_config=quant_config)
-                )
+                self._test_linear(partitioner, XNNPackQuantize(quantization_config=quant_config))
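The run() override above captures the test's setup: the unlifted module is tagged by delegate_external_constants_pass_unlifted with a gen_tag_fn that names the external weights file, and only then handed to torch.export. A hedged, self-contained version of that flow is sketched below; the toy module, the input shape, and the use of export_for_training to obtain the unlifted graph module are assumptions for illustration, not the test's exact code.

import torch
from torch.export import export, export_for_training

from executorch.exir.passes.external_constants_pass import (
    delegate_external_constants_pass_unlifted,
)


class ToyLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(3, 3)

    def forward(self, x):
        return self.linear(x)


inputs = (torch.randn(1, 3),)
# Obtain an unlifted graph module to run the tagging pass on (assumed step).
tagged_module = export_for_training(ToyLinear(), inputs).module()

# gen_tag_fn returns the file name the tagged weights should be saved to;
# returning "model" means the weights land in "model.ptd", as the diff's comment notes.
delegate_external_constants_pass_unlifted(
    module=tagged_module,
    gen_tag_fn=lambda x: "model",
)
exported_program = export(tagged_module, inputs, strict=True)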

examples/models/llama/export_llama_lib.py

Lines changed: 25 additions & 9 deletions

@@ -854,6 +854,7 @@ def _to_edge_and_lower_llama_xnnpack(
     xnnpack_extended_ops: bool = False,
     generate_etrecord: bool = False,
     verbose: bool = False,
+    gen_tag_fn: Optional[Callable[[torch.fx.Node], Optional[str]]] = None,
 ) -> LLMEdgeManager: # noqa: C901
     partitioners = []
 

@@ -876,9 +877,22 @@
     if generate_etrecord:
         builder_exported.generate_etrecord = True
 
-    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
+    builder = builder_exported.pt2e_quantize(quantizers)
+    from executorch.exir.passes.external_constants_pass import (
+        delegate_external_constants_pass_unlifted,
+    )
+    assert (
+        builder_exported.pre_autograd_graph_module is not None
+    ), "pre_autograd_graph_module shouldn't be None here"
+    delegate_external_constants_pass_unlifted(
+        module=builder_exported.pre_autograd_graph_module,
+        gen_tag_fn=gen_tag_fn,
+    )
+
+    builder = builder.to_edge_transform_and_lower(
         partitioners
     )
+
     if verbose:
         print_delegation_info(builder.edge_manager.exported_program().graph_module)
 

@@ -1088,6 +1102,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
         llm_config.backend.xnnpack.enabled = True
 
     if llm_config.backend.xnnpack.enabled:
+        gen_tag_fn = None
         if llm_config.export.foundation_weights_file is not None:
             gen_tag_fn: Callable[[torch.fx.Node], Optional[str]] = lambda x: (
                 llm_config.export.foundation_weights_file

@@ -1096,17 +1111,17 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
             )
 
             from executorch.exir.passes.external_constants_pass import (
-                delegate_external_constants_pass_unlifted,
+                # delegate_external_constants_pass_unlifted,
                 external_constants_pass,
             )
 
-            assert (
-                builder_exported.pre_autograd_graph_module is not None
-            ), "pre_autograd_graph_module shouldn't be None here"
-            delegate_external_constants_pass_unlifted(
-                module=builder_exported.pre_autograd_graph_module,
-                gen_tag_fn=gen_tag_fn,
-            )
+            # assert (
+            #     builder_exported.pre_autograd_graph_module is not None
+            # ), "pre_autograd_graph_module shouldn't be None here"
+            # delegate_external_constants_pass_unlifted(
+            #     module=builder_exported.pre_autograd_graph_module,
+            #     gen_tag_fn=gen_tag_fn,
+            # )
 
             # Also add a pass for 'to_executorch' to tag weights that aren't delegated.
             additional_passes.append(

@@ -1123,6 +1138,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
             xnnpack_extended_ops=llm_config.backend.xnnpack.extended_ops,
             generate_etrecord=llm_config.debug.generate_etrecord,
             verbose=llm_config.debug.verbose,
+            gen_tag_fn=gen_tag_fn,
         )
     else:
         builder = _to_edge_and_lower_llama(
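The net effect of these hunks is that _export_llama still builds gen_tag_fn from llm_config.export.foundation_weights_file, but the unlifted tagging pass now runs inside _to_edge_and_lower_llama_xnnpack, after pt2e quantization, instead of before it. The sketch below illustrates the shape of such a gen_tag_fn; the diff truncates the lambda's condition, so the "lora" name check and the .ptd file name are assumptions about its intent (foundation weights get tagged with the external file, LoRA adapter weights stay untagged and thus inside the .pte), not the exact code.

from typing import Callable, Optional

import torch
import torch.fx


def make_gen_tag_fn(foundation_weights_file: str) -> Callable[[torch.fx.Node], Optional[str]]:
    def gen_tag_fn(node: torch.fx.Node) -> Optional[str]:
        # Assumed heuristic: weights whose names mention "lora" are adapter
        # weights and return None (no external tag); everything else goes to
        # the foundation weights file.
        return foundation_weights_file if "lora" not in node.name else None

    return gen_tag_fn


gen_tag_fn = make_gen_tag_fn("llama3_2.ptd")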

exir/emit/_emitter.py

Lines changed: 24 additions & 8 deletions

@@ -387,13 +387,15 @@ def _save_new_const_tensor(
 
         # Update buffer_idx to point to the end of the list where we are adding the new buffer.
         buffer = Buffer(storage=buffer_data)
-
         # Tensor is stored outside of the PTE file.
         if (
             spec.extra_tensor_info is not None
             and spec.extra_tensor_info.fully_qualified_name is not None
             and spec.extra_tensor_info.location == TensorDataLocation.EXTERNAL
         ):
+            print(f"EXTERNAL CONSTANT {spec.extra_tensor_info.fully_qualified_name}")
+            if spec.extra_tensor_info.fully_qualified_name == "_tensor_constant_2":
+                breakpoint()
             assert (
                 constant_tag is not None
             ), "Constant tag is not set for external tensor"

@@ -466,9 +468,15 @@ def _tensor_spec_to_evalue(
             and spec.extra_tensor_info.location == TensorDataLocation.EXTERNAL
         ):
             buffer_idx = self.program_state.external_constant_hash.get(hashed, -1)
+            if buffer_idx != -1:
+                # Save the constant tag for the external tensor
+                if constant_tag not in self.program_state.external_constant_map:
+                    self.program_state.external_constant_map[constant_tag] = {}
+                self.program_state.external_constant_map[constant_tag][
+                    spec.extra_tensor_info.fully_qualified_name  # pyre-ignore Undefined attribute [16]: `Optional` has no attribute `fully_qualified_name`.
+                ] = buffer_idx
         else:
             buffer_idx = self.program_state.cached_spec_hash_values.get(hashed, -1)
-
         # Haven't seen this constant before.
         if buffer_idx == -1:
             buffer_idx = self._save_new_const_tensor(

@@ -1645,18 +1653,23 @@ def _is_buffer(node: Node, graph_signature: ExportGraphSignature) -> bool:
         # suggest that the same abstract buffer is mutable in another entry point so we should
         # compel it to be considered mutable in all entry points at emission just as the user did with
         # memory planning.
-        is_mutable_buffer |= (
-            _is_buffer(self.node, self.exported_program.graph_signature)
-            and spec.mem_id is not None
-            and spec.mem_offset is not None
-        )
-
+        # is_mutable_buffer |= (
+        #     _is_buffer(self.node, self.exported_program.graph_signature)
+        #     and spec.mem_id is not None
+        #     and spec.mem_offset is not None
+        # )
+        # if fqn is not None:
+        #     print(f"Node {fqn} is mutable buffer: {is_mutable_buffer}, with cnstant_tag {constant_tag}")
+
 
         # If the placeholder has a constant_tag, it is external to the PTE file
         # and requires a fqn and location=TensorDataLocation.EXTERNAL
         if constant_tag is not None:
             assert (
                 fqn is not None
             ), "constant tagged tensors require a fully qualified name"
+
+            if fqn == "_tensor_constant_2":
+                breakpoint()
             if spec.extra_tensor_info is None:
                 spec.extra_tensor_info = ExtraTensorInfo(
                     fully_qualified_name=fqn, location=TensorDataLocation.EXTERNAL

@@ -1666,15 +1679,18 @@ def _is_buffer(node: Node, graph_signature: ExportGraphSignature) -> bool:
                 spec.extra_tensor_info.location = TensorDataLocation.EXTERNAL
 
         if is_mutable_buffer:
+            print("MUTABLE_BUFFE: ", fqn, spec.mem_id, spec.mem_offset)
             # Emit names if we are supposed to.
             if self.emitter_state.emit_mutable_buffer_names:
+                breakpoint()
                 if spec.extra_tensor_info is None:
                     spec.extra_tensor_info = ExtraTensorInfo(
                         fully_qualified_name=fqn,
                         location=TensorDataLocation.SEGMENT,
                     )
                 else:
                     spec.extra_tensor_info.fully_qualified_name = fqn
+                    spec.extra_tensor_info.location = TensorDataLocation.SEGMENT
             # if We aren't emitting the name then it needs to be memory planned.
             elif spec.mem_id is None or spec.mem_offset is None:
                 raise InternalError(
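Most of these hunks add debug output; the substantive change is in _tensor_spec_to_evalue: when an external constant's bytes are already cached (buffer_idx != -1), its fully qualified name is still recorded under its constant tag in external_constant_map, so every tagged tensor shows up in the external data index even when its bytes are deduplicated. Below is a self-contained sketch of that bookkeeping; the toy types and function names are assumptions for illustration, not the emitter's real API.

import hashlib
from typing import Dict, List


class ToyProgramState:
    def __init__(self) -> None:
        self.external_constant_buffers: List[bytes] = []
        self.external_constant_hash: Dict[str, int] = {}
        # constant_tag -> {fully qualified name -> buffer index}
        self.external_constant_map: Dict[str, Dict[str, int]] = {}


def add_external_constant(state: ToyProgramState, tag: str, fqn: str, data: bytes) -> int:
    hashed = hashlib.sha256(data).hexdigest()
    buffer_idx = state.external_constant_hash.get(hashed, -1)
    if buffer_idx == -1:
        # First time seeing these bytes: append a new buffer and cache its hash.
        state.external_constant_buffers.append(data)
        buffer_idx = len(state.external_constant_buffers) - 1
        state.external_constant_hash[hashed] = buffer_idx
    # Record the fqn -> buffer index mapping under the tag in both the new and
    # the already-cached case, which is what the added emitter branch ensures.
    state.external_constant_map.setdefault(tag, {})[fqn] = buffer_idx
    return buffer_idx


state = ToyProgramState()
w = b"\x01\x02\x03\x04"
add_external_constant(state, "model.ptd", "layers.0.weight", w)
add_external_constant(state, "model.ptd", "layers.1.weight", w)  # dedups to the same buffer index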

runtime/executor/method.cpp

Lines changed: 2 additions & 1 deletion

@@ -394,7 +394,8 @@ Error Method::parse_external_constants(const NamedDataMap* external_data_map) {
     ET_CHECK_OR_RETURN_ERROR(
         buffer.ok(),
         InvalidExternalData,
-        "Buffer retrieved from get_data is not valid");
+        "Buffer retrieved from get_data is not valid, error: %zu",
+        buffer.error());
     new (&external_constants_[n_external_constants_].buffer)
         FreeableBuffer(std::move(buffer.get()));
 
