diff --git a/.ci/scripts/test_llama_lora.sh b/.ci/scripts/test_llama_lora.sh
index 6337bbf76a2..80987ed6f44 100644
--- a/.ci/scripts/test_llama_lora.sh
+++ b/.ci/scripts/test_llama_lora.sh
@@ -107,8 +107,9 @@ $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
   model.dtype_override="fp32" \
   backend.xnnpack.enabled=true \
   backend.xnnpack.extended_ops=true \
-  export.output_name="${MODEL_SEPARATE}.pte" \
-  export.foundation_weights_file="${MODEL_SEPARATE}.ptd"
+  quantization.pt2e_quantize="xnnpack_dynamic" \
+  export.output_name="${MODEL}.pte" \
+  export.foundation_weights_file="${MODEL}.ptd"
 
 # Run llama runner.
 NOW=$(date +"%H:%M:%S")
diff --git a/backends/test/harness/stages/__init__.py b/backends/test/harness/stages/__init__.py
index 36ed435ebd7..14431191621 100644
--- a/backends/test/harness/stages/__init__.py
+++ b/backends/test/harness/stages/__init__.py
@@ -1,6 +1,6 @@
 from .export import Export
 from .partition import Partition
-from .quantize import Quantize
+from .quantize import Quantize, Quantize_
 from .run_passes import RunPasses
 from .serialize import Serialize
 from .stage import Stage, StageType
@@ -12,6 +12,7 @@
     "Export",
     "Partition",
     "Quantize",
+    "Quantize_",
     "RunPasses",
     "Serialize",
     "Stage",
diff --git a/backends/test/harness/stages/quantize.py b/backends/test/harness/stages/quantize.py
index 9edb600e19f..bc83ba51200 100644
--- a/backends/test/harness/stages/quantize.py
+++ b/backends/test/harness/stages/quantize.py
@@ -1,4 +1,4 @@
-from typing import Any, Optional, Sequence, Tuple
+from typing import Any, Callable, Optional, Sequence, Tuple
 
 import torch
 
@@ -15,6 +15,8 @@
     prepare_qat_pt2e,
 )
 from torchao.quantization.pt2e.quantizer import Quantizer
+from torchao.quantization.quant_api import quantize_
+from torchao.utils import unwrap_tensor_subclass
 
 
 class Quantize(Stage):
@@ -79,3 +89,48 @@ def graph_module(self) -> str:
 
     def run_artifact(self, inputs):
         return self.converted_graph.forward(*inputs)
+
+
+class Quantize_(Stage):
+    """
+    TorchAO quantization stage using the quantize_ API.
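+
+    Example (illustrative sketch; the config shown is one torchao option, and
+    `model`/`example_inputs` are assumed to exist):
+
+        from torchao.quantization.quant_api import (
+            Int8DynamicActivationInt8WeightConfig,
+        )
+
+        stage = Quantize_(config=Int8DynamicActivationInt8WeightConfig())
+        stage.run(model, example_inputs)
+        quantized_model = stage.artifact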
+ """ + + def __init__( + self, + config: Any, + filter_fn: Optional[Callable[[torch.nn.Module, str], bool]] = None, + ): + """ + Args: + config: TorchAO quantization config (e.g., Int4WeightOnlyConfig, Int8DynamicActivationInt8WeightConfig) + filter_fn: Optional filter function to select which modules to quantize + """ + self.config = config + self.filter_fn = filter_fn + self.quantized_module = None + + def stage_type(self) -> str: + return StageType.QUANTIZE + + def run( + self, artifact: torch.nn.Module, inputs: Optional[Tuple[torch.Tensor]] + ) -> None: + # Apply quantize_ to the model + quantize_(artifact, self.config, self.filter_fn) + + # Unwrap tensor subclasses for export compatibility + unwrap_tensor_subclass(artifact) + + self.quantized_module = artifact + + @property + def artifact(self) -> torch.nn.Module: + return self.quantized_module + + @property + def graph_module(self) -> torch.nn.Module: + return self.quantized_module + + def run_artifact(self, inputs): + return self.quantized_module.forward(*inputs) diff --git a/backends/test/harness/tester.py b/backends/test/harness/tester.py index 351bab4a605..8588dfb2fa0 100644 --- a/backends/test/harness/tester.py +++ b/backends/test/harness/tester.py @@ -9,6 +9,7 @@ Export, Partition, Quantize, + Quantize_, RunPasses, Serialize, Stage, diff --git a/backends/xnnpack/_passes/__init__.py b/backends/xnnpack/_passes/__init__.py index 141718bde6f..c48896b3d81 100644 --- a/backends/xnnpack/_passes/__init__.py +++ b/backends/xnnpack/_passes/__init__.py @@ -23,6 +23,9 @@ from executorch.backends.xnnpack._passes.fuse_activation_pass import FuseActivationPass from executorch.backends.xnnpack._passes.fuse_batch_norm import FuseBatchNormPass from executorch.backends.xnnpack._passes.prelu_reshape_pass import PReLUReshapePass +from executorch.backends.xnnpack._passes.propagate_custom_meta_pass import ( + PropagateCustomMetaPass, +) from executorch.backends.xnnpack._passes.remove_redundant_copy_pass import ( RemoveRedundantCopyPass, ) @@ -59,6 +62,7 @@ def __init__( DimOrderOpsRevertPass, ConvertToUpsampleBilinear2d, ConvertToLinearPass, + PropagateCustomMetaPass, ConvertToSDPAPass, ConstPropPass, FuseBatchNormPass, diff --git a/backends/xnnpack/_passes/propagate_custom_meta_pass.py b/backends/xnnpack/_passes/propagate_custom_meta_pass.py new file mode 100644 index 00000000000..f39174b4112 --- /dev/null +++ b/backends/xnnpack/_passes/propagate_custom_meta_pass.py @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass +from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant +from executorch.exir.pass_base import PassResult + + +class PropagateCustomMetaPass(XNNPACKPass): + """ + Pass to propagate node.meta['custom'] from parent nodes to their q/dq child nodes. + For all quantize/dequantize nodes in the graph, if the parent node has a + node.meta['custom'] entry, this pass will copy that value to the q/dq node's meta. 
+ """ + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + + for node in graph.nodes: + if not (is_quant(node) or is_dequant(node)): + continue + + # Get the parent node (first input argument) + if len(node.all_input_nodes) == 0: + continue + + parent_node = node.args[0] + if not isinstance(parent_node, torch.fx.Node): + continue + + if "custom" in parent_node.meta: + print(f"PROPAGATING CUSTOM META FROM {parent_node.name} TO {node.name}") + node.meta["custom"] = parent_node.meta["custom"] + + graph_module.recompile() + + # Since we are overriding "call", we need to call the parent's "call" + # to retrace the graph and regenerate metadata + graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, True) diff --git a/backends/xnnpack/operators/node_visitor.py b/backends/xnnpack/operators/node_visitor.py index 68226644859..6f3e1c68ab9 100644 --- a/backends/xnnpack/operators/node_visitor.py +++ b/backends/xnnpack/operators/node_visitor.py @@ -296,6 +296,7 @@ def get_quant_params( offset=UINT64_MAX, size=num_bytes, named_key=scale_name ) ) + print(f"NDM: adding scale tensor with key {scale_name}") self._named_data_store.add_named_data( scale_name, bytes(scale_array), CONSTANT_TENSOR_ALIGNMENT ) @@ -630,6 +631,7 @@ def get_serialized_buffer_index( logging.info( f"Adding constant data with name {tensor.name}, key {named_key} and external_tag {external_tag} to named_data_store" ) + print(f"NDM: Adding constant data with name {tensor.name}, key {named_key} and tag {external_tag}") self._named_data_store.add_named_data( named_key, bytes(array), diff --git a/backends/xnnpack/test/passes/test_propagate_custom_meta_pass.py b/backends/xnnpack/test/passes/test_propagate_custom_meta_pass.py new file mode 100644 index 00000000000..94035e278f8 --- /dev/null +++ b/backends/xnnpack/test/passes/test_propagate_custom_meta_pass.py @@ -0,0 +1,158 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+
+import unittest
+
+from typing import Tuple, Union
+
+import torch
+from executorch.backends.test.harness import stages as BaseStages
+from executorch.backends.xnnpack.partition.config.xnnpack_config import (
+    ConfigPrecisionType,
+)
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+)
+from executorch.backends.xnnpack.test.tester import Quantize as XNNPackQuantize, Tester
+from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower
+from executorch.exir.passes.external_constants_pass import (
+    delegate_external_constants_pass_unlifted,
+)
+
+from torchao.quantization.granularity import PerGroup
+from torchao.quantization.quant_api import Int8DynamicActivationIntxWeightConfig
+
+try:
+    import executorch.extension.pybindings.portable_lib  # noqa: F401
+    import executorch.kernels.quantized  # noqa: F401
+
+    has_quantized_ops = True
+except ImportError:
+    has_quantized_ops = False
+    print("Missing quantized ops")
+
+
+class TestPropagateCustomMetaPass(unittest.TestCase):
+    class ModuleLinear(torch.nn.Module):
+        def __init__(
+            self,
+            in_size: int = 2,
+            input_channels: int = 4,
+            output_channels: int = 4,
+            dtype: torch.dtype = torch.float,
+            use_bias: bool = False,
+        ):
+            super().__init__()
+            self.linear = torch.nn.Linear(
+                input_channels, output_channels, bias=use_bias
+            ).to(dtype=dtype)
+
+            self.ic = input_channels
+            self.oc = output_channels
+            assert dtype in [torch.float, torch.half], "Unsupported op dtype"
+            self.op_dtype = dtype
+            self.in_size = in_size
+
+        def forward(self, x: torch.Tensor):
+            return self.linear(x)
+
+        def get_random_inputs(self):
+            inp = torch.randn(self.in_size, self.ic).to(self.op_dtype)
+            return (inp,)
+
+    class Export(BaseStages.Export):
+        def run(
+            self,
+            artifact: torch.nn.Module,
+            inputs: Tuple[torch.Tensor],
+        ) -> None:
+            tagged_module = torch.export.export(
+                artifact, inputs, dynamic_shapes=self.dynamic_shapes, strict=True
+            ).module()
+            # gen_tag_fn yields the file the tagged weights are serialized to;
+            # here all weights are saved to "model.ptd".
+            delegate_external_constants_pass_unlifted(
+                module=tagged_module,
+                gen_tag_fn=lambda x: "model",
+            )
+            self.exported_program = torch.export.export(
+                tagged_module, inputs, dynamic_shapes=self.dynamic_shapes, strict=True
+            )
+
+    def _test_linear(
+        self,
+        partitioner: XnnpackPartitioner,
+        quantization_stage: Union[BaseStages.Quantize, BaseStages.Quantize_],
+    ):
+        eager_model = self.ModuleLinear(
+            in_size=1,
+            input_channels=32,
+            output_channels=2,
+        )
+        test_inputs = eager_model.get_random_inputs()
+
+        tester = Tester(eager_model, test_inputs)
+        tester.quantize(quantization_stage)
+        tester.export(self.Export())
+        tester.to_edge_transform_and_lower(
+            ToEdgeTransformAndLower([partitioner])
+        ).to_executorch()
+        tester.run_method_and_compare_outputs()
+
+        executorch_program = tester.get_artifact()
+        program_buffer = executorch_program.buffer
+        self.assertEqual(len(executorch_program._tensor_data), 1)
+        data_buffer = bytes(executorch_program._tensor_data.pop("model"))
+        self.assertGreater(len(data_buffer), 200)
+        from executorch.extension.pybindings import portable_lib as runtime
+
+        module = runtime._load_for_executorch_from_buffer(program_buffer, data_buffer)
+        output = module.forward(test_inputs)
+        reference_output = executorch_program.exported_program().module()(
+            test_inputs[0],
+        )
+        self.assertTrue(torch.allclose(output[0], reference_output))
+
+        # TODO(lfq): This fails as expected, but segfaults after a few runs.
+        # with self.assertRaises(RuntimeError):
+        #     runtime._load_for_executorch_from_buffer(program_buffer).forward(
+        #         test_inputs
+        #     )
+
+    def test_quantize_(self):
+        # Quantize with the torchao quantize_ API.
+        DynamicallyQuantizedPartitioner = XnnpackPartitioner(
+            config_precisions=ConfigPrecisionType.DYNAMIC_QUANT,
+            per_op_mode=False,
+        )
+        linear_config = Int8DynamicActivationIntxWeightConfig(
+            weight_dtype=torch.int4,
+            weight_granularity=PerGroup(32),
+        )
+        self._test_linear(
+            DynamicallyQuantizedPartitioner,
+            BaseStages.Quantize_(config=linear_config),
+        )
+
+    def test_pt2e_quantize(self):
+        # Quantize with pt2e quantize.
+        quant_configs = [
+            # per_channel
+            get_symmetric_quantization_config(is_per_channel=True, is_dynamic=False),
+            # per_tensor
+            get_symmetric_quantization_config(is_per_channel=False, is_dynamic=False),
+            # per_channel_dynamic
+            get_symmetric_quantization_config(is_per_channel=True, is_dynamic=True),
+        ]
+        partitioners = []
+        for config_precision in [
+            ConfigPrecisionType.STATIC_QUANT,
+            ConfigPrecisionType.DYNAMIC_QUANT,
+        ]:
+            for per_op_mode in [True, False]:
+                partitioners.append(
+                    XnnpackPartitioner(
+                        config_precisions=config_precision,
+                        per_op_mode=per_op_mode,
+                    )
+                )
+        for quant_config in quant_configs:
+            for partitioner in partitioners:
+                self._test_linear(
+                    partitioner, XNNPackQuantize(quantization_config=quant_config)
+                )
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index aa3b157c8da..523d7622e7a 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -854,6 +854,7 @@ def _to_edge_and_lower_llama_xnnpack(
     xnnpack_extended_ops: bool = False,
     generate_etrecord: bool = False,
     verbose: bool = False,
+    gen_tag_fn: Optional[Callable[[torch.fx.Node], Optional[str]]] = None,
 ) -> LLMEdgeManager:  # noqa: C901
     partitioners = []
@@ -876,9 +877,22 @@
     if generate_etrecord:
         builder_exported.generate_etrecord = True
 
-    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
+    builder = builder_exported.pt2e_quantize(quantizers)
+
+    if gen_tag_fn is not None:
+        from executorch.exir.passes.external_constants_pass import (
+            delegate_external_constants_pass_unlifted,
+        )
+
+        assert (
+            builder_exported.pre_autograd_graph_module is not None
+        ), "pre_autograd_graph_module shouldn't be None here"
+        delegate_external_constants_pass_unlifted(
+            module=builder_exported.pre_autograd_graph_module,
+            gen_tag_fn=gen_tag_fn,
+        )
+
+    builder = builder.to_edge_transform_and_lower(
         partitioners
     )
+
     if verbose:
         print_delegation_info(builder.edge_manager.exported_program().graph_module)
@@ -1088,6 +1102,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
         llm_config.backend.xnnpack.enabled = True
 
     if llm_config.backend.xnnpack.enabled:
+        gen_tag_fn = None
         if llm_config.export.foundation_weights_file is not None:
             gen_tag_fn: Callable[[torch.fx.Node], Optional[str]] = lambda x: (
                 llm_config.export.foundation_weights_file
@@ -1096,17 +1111,17 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
             )
 
             from executorch.exir.passes.external_constants_pass import (
-                delegate_external_constants_pass_unlifted,
                 external_constants_pass,
             )
 
-            assert (
-                builder_exported.pre_autograd_graph_module is not None
-            ), "pre_autograd_graph_module shouldn't be None here"
-            delegate_external_constants_pass_unlifted(
-                module=builder_exported.pre_autograd_graph_module,
-                gen_tag_fn=gen_tag_fn,
-            )
 
             # Also add a pass for 'to_executorch' to tag weights that aren't delegated.
             additional_passes.append(
@@ -1123,6 +1138,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
             xnnpack_extended_ops=llm_config.backend.xnnpack.extended_ops,
             generate_etrecord=llm_config.debug.generate_etrecord,
             verbose=llm_config.debug.verbose,
+            gen_tag_fn=gen_tag_fn,
         )
     else:
         builder = _to_edge_and_lower_llama(
diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py
index 15e0b23d36f..a44a52698aa 100644
--- a/exir/emit/_emitter.py
+++ b/exir/emit/_emitter.py
@@ -466,9 +468,15 @@ def _tensor_spec_to_evalue(
             and spec.extra_tensor_info.location == TensorDataLocation.EXTERNAL
         ):
             buffer_idx = self.program_state.external_constant_hash.get(hashed, -1)
+            if buffer_idx != -1:
+                # Tensor was deduplicated; still record its constant tag and
+                # buffer index in the external constant map.
+                if constant_tag not in self.program_state.external_constant_map:
+                    self.program_state.external_constant_map[constant_tag] = {}
+                self.program_state.external_constant_map[constant_tag][
+                    spec.extra_tensor_info.fully_qualified_name  # pyre-ignore[16]: `Optional` has no attribute `fully_qualified_name`.
+                ] = buffer_idx
         else:
             buffer_idx = self.program_state.cached_spec_hash_values.get(hashed, -1)
-
         # Haven't seen this constant before.
         if buffer_idx == -1:
             buffer_idx = self._save_new_const_tensor(
@@ -1666,8 +1679,10 @@ def _is_buffer(node: Node, graph_signature: ExportGraphSignature) -> bool:
             spec.extra_tensor_info.location = TensorDataLocation.EXTERNAL
 
         if is_mutable_buffer:
             # Emit names if we are supposed to.
             if self.emitter_state.emit_mutable_buffer_names:
                 if spec.extra_tensor_info is None:
                     spec.extra_tensor_info = ExtraTensorInfo(
                         fully_qualified_name=fqn,
                     )
                 else:
                     spec.extra_tensor_info.fully_qualified_name = fqn
+                    spec.extra_tensor_info.location = TensorDataLocation.SEGMENT
             # if We aren't emitting the name then it needs to be memory planned.
             elif spec.mem_id is None or spec.mem_offset is None:
                 raise InternalError(
diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
index ccb88a03818..4b57c00e7cf 100644
--- a/runtime/executor/method.cpp
+++ b/runtime/executor/method.cpp
@@ -394,7 +394,8 @@ Error Method::parse_external_constants(const NamedDataMap* external_data_map) {
   ET_CHECK_OR_RETURN_ERROR(
       buffer.ok(),
       InvalidExternalData,
-      "Buffer retrieved from get_data is not valid");
+      "Buffer retrieved from get_data is not valid, error: 0x%" PRIx32,
+      static_cast<uint32_t>(buffer.error()));
   new (&external_constants_[n_external_constants_].buffer)
       FreeableBuffer(std::move(buffer.get()));
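
Review note: end to end, the split artifacts are consumed as in the new test. A
minimal sketch via the pybindings (file names and the input shape are
illustrative, not part of this diff):

    import torch
    from executorch.extension.pybindings import portable_lib as runtime

    # Load the program together with its external foundation weights; omitting
    # the .ptd buffer should surface the InvalidExternalData error whose
    # message is improved in method.cpp above.
    with open("model.pte", "rb") as f, open("model.ptd", "rb") as g:
        module = runtime._load_for_executorch_from_buffer(f.read(), g.read())

    output = module.forward((torch.randn(1, 32),))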