Skip to content

Commit 22a9e78

Browse files
lucylq authored and facebook-github-bot committed
Decompose after export in export_llama (#15951)
Summary: `unwrap_tensor_subclass` was not unwrapping nested lora linears. This meant qdata/scale/zero were bundled together in the subclass, and separated when running decompositions inside to_edge_transform_and_lower. This happens after nodes are tagged, meaning that the scales were not tagged, and remained in the PTE file after the rest of the weights were moved to a PTD file. It's recommended to move away from `unwrap_tensor_subclass` and rely on export + decomps. This PR adds a decomp after exporting in export_llama, and removes cases of `unwrap_tensor_subclass`. TODO: remove all cases of `unwrap_tensor_subclass` in ET. Test Plan: Add a check that quantized weights are in the PTD file (not the PTE file) after quantization. This is a simple check; nested linears seem to be the real issue that decomposing resolves. TODO to add a test for that (probably an e2e test with stories in a subsequent PR) ``` python -m unittest executorch.backends.xnnpack.test.passes.test_propagate_custom_meta_pass ``` Reviewed By: metascroy Differential Revision: D87826410 Pulled By: lucylq
1 parent 7fa93a7 commit 22a9e78

File tree

3 files changed

+34
-13
lines changed

3 files changed

+34
-13
lines changed

backends/xnnpack/test/passes/test_propagate_custom_meta_pass.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,15 @@
2020
)
2121
from executorch.backends.xnnpack.test.tester import Quantize as XNNPackQuantize, Tester
2222
from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower
23+
24+
from executorch.exir import ExecutorchProgramManager
25+
from executorch.exir._serialize import _deserialize_pte_binary
2326
from executorch.exir.passes.external_constants_pass import (
2427
delegate_external_constants_pass_unlifted,
2528
)
29+
from executorch.extension.flat_tensor.serialize.serialize import (
30+
_deserialize_to_flat_tensor,
31+
)
2632

2733
from torchao.quantization.granularity import PerGroup
2834
from torchao.quantization.quant_api import Int8DynamicActivationIntxWeightConfig
@@ -87,7 +93,7 @@ def _test_linear(
8793
self,
8894
partitioner: XnnpackPartitioner,
8995
quantization_stage: Union[BaseStages.Quantize, BaseStages.Quantize_],
90-
):
96+
) -> ExecutorchProgramManager:
9197
eager_model = self.ModuleLinear(
9298
in_size=1,
9399
input_channels=32,
@@ -106,7 +112,7 @@ def _test_linear(
106112
exec = tester.get_artifact()
107113
program_buffer = exec.buffer
108114
self.assertEqual(len(exec._tensor_data), 1)
109-
data_buffer = bytes(exec._tensor_data.pop("model"))
115+
data_buffer = bytes(exec._tensor_data["model"])
110116
self.assertTrue(len(data_buffer) > 200)
111117
from executorch.extension.pybindings import portable_lib as runtime
112118

@@ -122,6 +128,8 @@ def _test_linear(
122128
# test_inputs
123129
# )
124130

131+
return exec
132+
125133
def test_quantize_(self):
126134
# Quantize with torchao quantize_ API.
127135
DynamicallyQuantizedPartitioner = XnnpackPartitioner(
@@ -132,9 +140,16 @@ def test_quantize_(self):
132140
weight_dtype=torch.int4,
133141
weight_granularity=PerGroup(32),
134142
)
135-
self._test_linear(
143+
exec = self._test_linear(
136144
DynamicallyQuantizedPartitioner, BaseStages.Quantize_(config=linear_config)
137145
)
146+
# PTE file has no named data.
147+
pte_file = _deserialize_pte_binary(exec.buffer)
148+
self.assertEqual(pte_file.named_data, None)
149+
150+
# PTD file contains quantized weight and scale.
151+
ptd_file = _deserialize_to_flat_tensor(bytes(exec._tensor_data["model"]))
152+
self.assertEqual(len(ptd_file.named_data), 2)
138153

139154
def test_pt2e_quantize(self):
140155
# Quantize with pt2e quantize.
@@ -156,6 +171,15 @@ def test_pt2e_quantize(self):
156171
partitioner = XnnpackPartitioner(
157172
config_precisions=precision, per_op_mode=per_op_mode
158173
)
159-
self._test_linear(
174+
exec = self._test_linear(
160175
partitioner, XNNPackQuantize(quantization_config=quant_config)
161176
)
177+
# PTE file has no named data.
178+
pte_file = _deserialize_pte_binary(exec.buffer)
179+
self.assertEqual(pte_file.named_data, None)
180+
181+
# PTD file contains quantized weight, and potentially scale.
182+
ptd_file = _deserialize_to_flat_tensor(
183+
bytes(exec._tensor_data["model"])
184+
)
185+
self.assertTrue(len(ptd_file.named_data) >= 1)

examples/models/llama/source_transformation/quantize.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,9 +194,6 @@ def filter_fn(m, fqn):
194194
),
195195
filter_fn=filter_fn,
196196
)
197-
198-
model = unwrap_tensor_subclass(model)
199-
200197
# TODO: deal with checkpoint / computation dtype decoupling.
201198

202199
if verbose:

extension/llm/export/builder.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
from torch.nn.attention import SDPBackend
3939
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
4040
from torchao.quantization.pt2e.quantizer import ComposableQuantizer, Quantizer
41-
from torchao.utils import unwrap_tensor_subclass
4241

4342
FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
4443
logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -203,11 +202,6 @@ def _get_edge_config(self) -> EdgeCompileConfig:
203202
return edge_config
204203

205204
def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram:
206-
if module is not None:
207-
unwrap_tensor_subclass(module)
208-
else:
209-
unwrap_tensor_subclass(self.model)
210-
211205
dynamic_shape = self._get_dynamic_shape()
212206
# 1. torch.nn.attention.sdpa_kernel([SDPBackend.MATH]) is for bypassing the dynamo error when tracing
213207
# 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up)
@@ -226,6 +220,12 @@ def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram:
226220
dynamic_shapes=dynamic_shape,
227221
strict=True,
228222
)
223+
# Functionalize the graph, and decompose subclasses from torchao quantize.
224+
from executorch.exir.tracer import _default_decomposition_table
225+
226+
exported_module = exported_module.run_decompositions(
227+
_default_decomposition_table()
228+
)
229229
return exported_module
230230

231231
def export(self) -> "LLMEdgeManager":

0 commit comments

Comments (0)