import torch

from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.quant_api import (
    Int8DynamicActivationIntxWeightConfig,
    IntxWeightOnlyConfig,
    quantize_,
)
from torchao.utils import unwrap_tensor_subclass
from torch.export import export
from executorch.exir import (
    EdgeCompileConfig,
    ExecutorchBackendConfig,
    to_edge_transform_and_lower,
)
from executorch.backends.xnnpack.partition.config.xnnpack_config import (
    ConfigPrecisionType,
)
from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
    XnnpackPartitioner,
)
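
# End-to-end sketch: quantize a small linear model with torchao (8-bit
# dynamic activations, 4-bit weights), export it, tag its weights for
# external-data emission, and lower it to ExecuTorch via XNNPACK, checking
# numerics after each step.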
# Optional: quantize embedding layers to 8-bit weights, per channel.
# (Left commented out; ModuleLinear below has no embedding layers.)
# embedding_config = IntxWeightOnlyConfig(
#     weight_dtype=torch.int8,
#     granularity=PerAxis(0),
# )
# quantize_(
#     eager_model,
#     embedding_config,
#     lambda m, fqn: isinstance(m, torch.nn.Embedding),
# )

torch.manual_seed(0)

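# A minimal linear module that serves as the quantization target.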
class ModuleLinear(torch.nn.Module):
    def __init__(
        self,
        in_size: int = 2,
        input_channels: int = 4,
        output_channels: int = 4,
        dtype: torch.dtype = torch.float,
        use_bias: bool = False,
    ):
        super().__init__()
        self.linear = torch.nn.Linear(
            input_channels, output_channels, bias=use_bias
        ).to(dtype=dtype)

        self.ic = input_channels
        self.oc = output_channels
        assert dtype in [torch.float, torch.half], "Unsupported op dtype"
        self.op_dtype = dtype
        self.in_size = in_size

    def forward(self, x: torch.Tensor):
        return self.linear(x)

    def get_random_inputs(self):
        inp = torch.randn(self.in_size, self.ic).to(self.op_dtype)
        return (inp,)
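
# Instantiate the model and record a float baseline for later comparisons.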
eager_model = ModuleLinear(
    in_size=1,
    input_channels=32,
    output_channels=2,
)

test_inputs = eager_model.get_random_inputs()
eager_result = eager_model(*test_inputs)
print("eager result: ", eager_result)
# Quantize linear layers with 8-bit dynamic activations and 4-bit weights.
linear_config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,
    weight_granularity=PerGroup(32),
)
quantize_(eager_model, linear_config)

quantized_result = eager_model(*test_inputs)
print("quantized results: ", quantized_result)
print(torch.allclose(eager_result, quantized_result, atol=1e-1))
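
# Unwrap torchao tensor subclasses into plain tensors and quantization ops
# so torch.export can trace the model; numerics should be unchanged.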
unwrap_tensor_subclass(eager_model)
unwrapped_result = eager_model(*test_inputs)
print("unwrapped results: ", unwrapped_result)
print(torch.allclose(quantized_result, unwrapped_result, atol=1e-3))

from executorch.exir.passes.external_constants_pass import (
    delegate_external_constants_pass_unlifted,
)

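# Export the quantized model and verify the exported graph's output.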
ep1 = export(eager_model, test_inputs, dynamic_shapes=None, strict=True)
exported_result = ep1.module()(*test_inputs)
print("exported program: ", exported_result)
print(torch.allclose(quantized_result, exported_result, atol=1e-3))
print("Graph: ")
ep1.graph_module.print_readable()
# Tag the unlifted ep1.module() so its weights become external constants.
tagged_module = ep1.module()
delegate_external_constants_pass_unlifted(
    module=tagged_module,
    # gen_tag_fn chooses the file the weights are saved to; tagging with
    # "model" saves them as "model.ptd".
    gen_tag_fn=lambda x: "model",
)
# Re-export the tagged module and confirm numerics are unchanged.
ep = export(tagged_module, test_inputs, dynamic_shapes=None, strict=True)
exported_result = ep.module()(*test_inputs)
print("exported program (after tagging): ", exported_result)
print(torch.allclose(quantized_result, exported_result, atol=1e-3))
# Check tagged nodes:
for node in list(ep.graph.nodes):
    if 'custom' in node.meta:
        print(f"Node: {node.name}, meta: {node.meta['custom']}")

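# Partition with an XNNPACK partitioner restricted to dynamically quantized
# ops; per_op_mode delegates each op in its own partition.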
DynamicallyQuantizedPartitioner = XnnpackPartitioner(
    config_precisions=ConfigPrecisionType.DYNAMIC_QUANT,
    per_op_mode=True,
)
edge = to_edge_transform_and_lower(
    ep,
    compile_config=EdgeCompileConfig(_check_ir_validity=False),
    partitioner=[DynamicallyQuantizedPartitioner],
    generate_etrecord=False,
)
# After lowering, the delegated partitions absorb most of the graph; the
# remaining top-level nodes may be little more than a
# torchao_dequantize_affine_default call.
edge_result = edge.exported_program().module()(*test_inputs)
print("edge program: ", edge_result)
print(torch.allclose(quantized_result, edge_result, atol=1e-3))
edge.exported_program().graph_module.print_readable()
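
# Convert the edge program to an ExecuTorch program and check the final
# output against the quantized baseline.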
executorch_program = edge.to_executorch(ExecutorchBackendConfig())
exec_result = executorch_program.exported_program().module()(*test_inputs)
print("executorch program: ", exec_result)
print(torch.allclose(quantized_result, exec_result, atol=1e-3))
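
# A possible final step (a sketch, not part of the original script): write
# the serialized program to disk. ExecutorchProgramManager.buffer holds the
# .pte bytes; because the weights were tagged with gen_tag_fn -> "model",
# they are expected to land in a separate "model.ptd" file, though the exact
# API for emitting that external-data file may vary across ExecuTorch
# versions.
with open("model.pte", "wb") as f:
    f.write(executorch_program.buffer)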