Commit 71602a0

mcr229 authored and facebook-github-bot committed
Allow Partitioner to Force Dynamic Linear Computation (#5338)
Summary: Pull Request resolved: #5338

# Motivation

A current drawback of XNNPACK is that weights are duplicated across delegate instances when they do not solely belong to one partition. Ops like LSTM reuse the same few weights and biases across multiple linear nodes, so the LSTM weight/bias tensors get duplicated for every instance of linear, which can blow up the model size. XNNPACK supports dynamic linear, in which weights are supplied at runtime rather than packed ahead of time (AoT). This change lets us force the partitioner not to partition the weights, so the XNNPACK delegate does not own them and therefore does not duplicate them. This is currently only supported for FP32 weights, but it can be used to trade some performance for smaller file sizes.

Reviewed By: GregoryComer

Differential Revision: D62621998

fbshipit-source-id: 646f25af5f532718e88695173b9c17b6b03ff293
1 parent 034e098 commit 71602a0
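
For context, a minimal sketch of how a caller might enable this behavior when lowering a model. The `force_fp32_dynamic_linear` flag comes from the test added in this commit; the surrounding `torch.export` / `to_edge_transform_and_lower` flow is assumed to follow the usual ExecuTorch lowering path and is not part of this diff:

```python
import torch
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower


class TinyLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(32, 10)

    def forward(self, x):
        return self.linear(x)


ep = torch.export.export(TinyLinear().eval(), (torch.rand(1, 32),))

# With force_fp32_dynamic_linear=True, FP32 linear weights are left out of the
# partition, so the delegate does not serialize (and duplicate) them; XNNPACK
# receives them at runtime via its dynamic linear path instead.
edge = to_edge_transform_and_lower(
    ep,
    partitioner=[XnnpackPartitioner(force_fp32_dynamic_linear=True)],
)
et_program = edge.to_executorch()
```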

File tree

5 files changed: +92 -13 lines changed


backends/xnnpack/partition/config/gemm_configs.py

Lines changed: 20 additions & 7 deletions
```diff
@@ -52,8 +52,8 @@ class GEMMConfig(XNNPartitionerConfig):
     different ops
     """
 
-    def __init__(self, weight_idx, bias_idx, act_idx, fused_acts):
-        super().__init__()
+    def __init__(self, weight_idx, bias_idx, act_idx, fused_acts, **kwargs):
+        super().__init__(**kwargs)
         self.weight_idx = weight_idx
         self.bias_idx = bias_idx
         self.act_idx = act_idx
@@ -250,17 +250,28 @@ def _get_act_deps(
 class LinearConfig(GEMMConfig):
     target_name = "linear.default"
 
-    def __init__(self):
+    def __init__(self, **kwargs):
         super().__init__(
             weight_idx=1,
             bias_idx=2,
             act_idx=0,
             fused_acts=["relu.default", "hardtanh.default"],
+            **kwargs,
         )
 
     def get_original_aten(self) -> Optional[torch._ops.OpOverload]:
         return torch.ops.aten.linear.default
 
+    def _get_weight_deps(
+        self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType
+    ) -> Tuple[bool, List[torch.fx.Node]]:
+        if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear:
+            # if force fp32_dynamic_linear is on and we detected this as fp32, then we
+            # do not partition the weight node
+            return (True, [])
+
+        return super()._get_weight_deps(node, ep, precision)
+
     def supported_precision_types(self):
         return [
             ConfigPrecisionType.DYNAMIC_QUANT,
@@ -272,12 +283,13 @@ def supported_precision_types(self):
 class ConvolutionConfig(GEMMConfig):
     target_name = "convolution.default"
 
-    def __init__(self):
+    def __init__(self, **kwargs):
         super().__init__(
             weight_idx=1,
             bias_idx=2,
             act_idx=0,
             fused_acts=["relu.default", "hardtanh.default"],
+            **kwargs,
         )
 
     def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
@@ -314,12 +326,13 @@ class AddmmConfig(GEMMConfig):
 
     target_name = "addmm.default"
 
-    def __init__(self):
+    def __init__(self, **kwargs):
         super().__init__(
             weight_idx=2,
             bias_idx=0,
             act_idx=1,
             fused_acts=["relu.default", "hardtanh.default"],
+            **kwargs,
         )
         self.src_partitions = None
         self.linear_modules = [torch.nn.functional.linear, torch.nn.Linear]
@@ -417,8 +430,8 @@ def supported_precision_types(self):
 class MMConfig(AddmmConfig):
     target_name = "mm.default"
 
-    def __init__(self):
-        super().__init__()
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
         self.bias_idx = None
         self.weight_idx = 1
         self.act_idx = 0
```

backends/xnnpack/partition/config/generic_node_configs.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -25,13 +25,13 @@
 
 
 class GenericNodePartitionerConfig(XNNPartitionerConfig):
-    def __init__(self, fused_act: Optional[List[str]] = None):
+    def __init__(self, fused_act: Optional[List[str]] = None, **kwargs):
         """
         fused_act is a list of node target names that can be fused with this
         node under quantization
         """
         self.fused_acts = fused_act or []
-        super().__init__()
+        super().__init__(**kwargs)
 
     def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
         return self.check_common_constraints(node, ep)
@@ -98,8 +98,8 @@ def supported_precision_types(self) -> List[ConfigPrecisionType]:
 class AddConfig(GenericNodePartitionerConfig):
     target_name = "add.Tensor"
 
-    def __init__(self):
-        super().__init__(fused_act=["relu.default"])
+    def __init__(self, **kwargs):
+        super().__init__(fused_act=["relu.default"], **kwargs)
 
     def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT]
```

backends/xnnpack/partition/config/xnnpack_config.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -37,9 +37,11 @@ class XNNPartitionerConfig(PartitionerConfig):
     types they want to enable
     """
 
-    def __init__(self):
+    def __init__(self, **kwargs):
         super().__init__()
         self.enabled_precision_types = self.supported_precision_types()
+        # Flag used in GEMMConfig()
+        self.force_fp32_dynamic_linear = kwargs.get("force_fp32_dynamic_linear", False)
 
     def get_partition(
         self, node: torch.fx.Node, ep: ExportedProgram
```

backends/xnnpack/partition/xnnpack_partitioner.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -36,6 +36,7 @@ def __init__(
         ] = None,
         per_op_mode=False,
         verbose: bool = False,
+        **kwargs,
     ):
         """
         @verbose: if True, print out more information about the partitioner.
@@ -55,7 +56,7 @@ def __init__(
 
         for config in configs_to_use:
             # Config Classes given to XnnpackPartitioner should no longer be abstract
-            initialized = config() # pyre-ignore
+            initialized = config(**kwargs) # pyre-ignore
             initialized.set_enabled_precision_types(config_precisions)
             initialized_configs.append(initialized)
```
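
Because the partitioner now forwards `**kwargs` to each config, any flag understood by `XNNPartitionerConfig.__init__` can be set for all configs at once through the partitioner constructor. A small illustrative check, assuming the module paths shown in this diff:

```python
from executorch.backends.xnnpack.partition.config.gemm_configs import LinearConfig

# The flag is stored by the XNNPartitionerConfig base class, so every config
# built via config(**kwargs) inside XnnpackPartitioner carries it.
cfg = LinearConfig(force_fp32_dynamic_linear=True)
assert cfg.force_fp32_dynamic_linear
```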

backends/xnnpack/test/ops/lstm.py

Lines changed: 63 additions & 0 deletions
```diff
@@ -0,0 +1,63 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+
+from executorch.backends.xnnpack.test.tester import Tester
+from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower
+
+
+class TestLSTM(unittest.TestCase):
+    class LSTMLinear(torch.nn.Module):
+        def __init__(self, input_size, hidden_size, out_size):
+            super().__init__()
+            self.lstm = torch.nn.LSTM(
+                input_size=input_size, hidden_size=hidden_size, batch_first=True
+            )
+            self.linear = torch.nn.Linear(hidden_size, hidden_size)
+            self.linear2 = torch.nn.Linear(hidden_size, out_size)
+
+        def forward(self, x):
+            x, hs = self.lstm(x)
+            x = self.linear(x[:, -1, :])
+            x = self.linear2(x)
+            return torch.nn.functional.log_softmax(x, dim=1)
+
+    def test_fp32_lstm(self):
+        (
+            Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),))
+            .export()
+            .to_edge_transform_and_lower()
+            .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"])
+            .check_not(
+                ["p_lstm_weight", "p_lstm_bias"]
+            ) # These Should be Consumed by Delegate
+            .to_executorch()
+            .serialize()
+            .run_method_and_compare_outputs()
+        )
+
+    def test_fp32_lstm_force_dynamic_linear(self):
+        (
+            Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),))
+            .export()
+            .to_edge_transform_and_lower(
+                ToEdgeTransformAndLower(
+                    partitioners=[XnnpackPartitioner(force_fp32_dynamic_linear=True)]
+                )
+            )
+            .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"])
+            # Weights are supplied as input to linears
+            .check(["p_lstm_weight_hh_l0", "p_lstm_weight_ih_l0"])
+            # Biases are owned by delegates
+            .check_not(["p_lstm_bias"])
+            .to_executorch()
+            .serialize()
+            .run_method_and_compare_outputs()
+        )
```
