Commit 20a2133

mcr229 authored and facebook-github-bot committed
Support Empty Input Tensors and > 5 Cat Inputs
Summary: PyTorch's cat.default operator can take an arbitrarily large number of inputs, because its input is a Tensor List. XNNPACK, however, supports at most 5 input tensors at a time. Since concatenating more than 5 tensors is common, we should still support such cats. We do so by adding a pass that decomposes the Cat operator: the first 5 inputs are concatenated together, and then we recursively inject additional concatenate nodes, each concatenating the result of the previous operation with the next 4 input tensors.

Another common design pattern is to start with an empty tensor and concatenate tensors into it, which results in some empty tensors as inputs to concatenate. Previously we did not partition inputs with empty tensors. I don't remember what the issue with empty tensors was, but it seems to work now, so we disable that partitioner check for now. Perhaps CI will pick up an error if this is indeed erroneous.

Differential Revision: D68523312
1 parent ef2444f commit 20a2133
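
For intuition, here is a minimal standalone sketch (not the pass itself) of the decomposition described in the summary. decompose_cat_ref is a hypothetical helper that mirrors the pass at the tensor level; it shows that nesting cats of at most 5 inputs reproduces a single large cat:

import torch

def decompose_cat_ref(tensors, dim=0, max_inputs=5):
    # First cat takes up to `max_inputs` tensors; each following cat takes
    # the previous result plus up to (max_inputs - 1) more tensors.
    result = torch.cat(tensors[:max_inputs], dim)
    remaining = list(tensors[max_inputs:])
    while remaining:
        group, remaining = remaining[: max_inputs - 1], remaining[max_inputs - 1 :]
        result = torch.cat([result] + group, dim)
    return result

tensors = [torch.randn(1, 2, 3) for _ in range(9)]
assert torch.equal(decompose_cat_ref(tensors), torch.cat(tensors))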

File tree

7 files changed: +171 -53

backends/xnnpack/_passes/TARGETS

Lines changed: 1 addition & 14 deletions
@@ -4,20 +4,7 @@ oncall("executorch")
 
 python_library(
     name = "xnnpack_passes",
-    srcs = [
-        "__init__.py",
-        "channels_last_tagged_reshape_pass.py",
-        "conv1d_unsqueeze_pass.py",
-        "convert_to_linear.py",
-        "convert_to_sdpa.py",
-        "convert_to_upsample_bilinear2d.py",
-        "fuse_activation_pass.py",
-        "fuse_batch_norm_with_conv.py",
-        "prelu_reshape_pass.py",
-        "remove_getitem_op.py",
-        "tag_implicit_q_dq_pass.py",
-        "xnnpack_pass.py",
-    ],
+    srcs = native.glob(["*.py"]),
     deps = [
         "//caffe2:torch",
         "//executorch/backends/transforms:addmm_mm_to_linear",

backends/xnnpack/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -21,6 +21,7 @@
 from executorch.backends.xnnpack._passes.fuse_batch_norm_with_conv import (
     FuseBatchNormWithConvPass,
 )
+from executorch.backends.xnnpack._passes.decompose_cat import DecomposeConcatenate
 from executorch.backends.xnnpack._passes.prelu_reshape_pass import PReLUReshapePass
 from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
 from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import (
@@ -63,6 +64,7 @@ def __init__(
             ConstPropPass,
             FuseBatchNormWithConvPass,
             FuseActivationPass,
+            DecomposeConcatenate,
             RemoveGetItemPass,
             Conv1dUnsqueezePass,
             PReLUReshapePass,
backends/xnnpack/_passes/decompose_cat.py

Lines changed: 80 additions & 0 deletions

@@ -0,0 +1,80 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+class DecomposeConcatenate(ExportPass):
+    """
+    XNNPACK's Concatenate operation only supports concatenation of <= 5 tensors
+    at a time. As a result, to support concatenates with > 5 tensors, we
+    decompose them into sequences of cats, each with <= 5 tensors.
+
+    Example:
+        Before Pass:
+            cat: "f32" = torch.ops.aten.cat.default([t1, t2, t3, t4, t5, t6], 1);
+
+        After Pass:
+            cat: "f32" = torch.ops.aten.cat.default([t1, t2, t3, t4, t5], 1);
+            cat_1: "f32" = torch.ops.aten.cat.default([cat, t6], 1);
+    """
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        gm = graph_module
+        for node in gm.graph.nodes:
+            if (
+                node.op == "call_function"
+                and node.target.__name__ == "aten.cat.default"
+            ):
+                concat_args = node.args
+                nodes_to_concat = node.args[0]
+                if len(nodes_to_concat) <= 5:
+                    continue
+
+                is_quantized = all(
+                    is_dequant(n) for n in nodes_to_concat
+                ) and all(is_quant(n) for n in node.users.keys())
+
+                # Replace the cat args with the same args but only the first 5 nodes.
+                new_concat_args = (nodes_to_concat[:5],) + concat_args[1:]
+                node.args = new_concat_args
+
+                remainder_nodes_to_concat = nodes_to_concat[5:]
+                with gm.graph.inserting_after(node):
+                    remainder_concat_node = gm.graph.create_node(
+                        "call_function",
+                        target=exir_ops.edge.aten.cat.default,
+                        args=([],),  # we will replace this with the remainder nodes later
+                        kwargs=node.kwargs,
+                    )
+                node.replace_all_uses_with(remainder_concat_node)
+                if is_quantized:
+                    # Requantize the intermediate cat with the first input's
+                    # q-params so the decomposed chain stays in the quantized domain.
+                    q_params = nodes_to_concat[0].args[1:]
+                    with gm.graph.inserting_after(node):
+                        q_node = gm.graph.create_node(
+                            "call_function",
+                            target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+                            args=(node,) + q_params,
+                            kwargs=node.kwargs,
+                        )
+                    with gm.graph.inserting_after(q_node):
+                        dq_node = gm.graph.create_node(
+                            "call_function",
+                            target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+                            args=(q_node,) + q_params,
+                            kwargs=node.kwargs,
+                        )
+                    remainder_concat_node.args = (
+                        [dq_node] + remainder_nodes_to_concat,
+                    ) + node.args[1:]
+                else:
+                    remainder_concat_node.args = (
+                        [node] + remainder_nodes_to_concat,
+                    ) + node.args[1:]
+
+        gm.recompile()
+        graph_module = super().call(gm).graph_module
+        return PassResult(graph_module, True)
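
The quantized branch above re-inserts a quantize/dequantize pair using the first input's q-params, so each intermediate cat result re-enters the quantized domain before feeding the next cat. A rough numeric sketch of why reusing the same params is lossless here (the scale and zero-point values are illustrative, not taken from the commit):

import torch

scale, zero_point = 0.05, 0  # illustrative q-params shared by all cat inputs
dq = [
    torch.dequantize(torch.quantize_per_tensor(torch.randn(1, 2, 3), scale, zero_point, torch.qint8))
    for _ in range(7)
]  # inputs as they would appear after the leading dequantize nodes

direct = torch.cat(dq, 1)
# Decomposed path: requantize the intermediate cat with the same params.
inter = torch.dequantize(torch.quantize_per_tensor(torch.cat(dq[:5], 1), scale, zero_point, torch.qint8))
decomposed = torch.cat([inter] + dq[5:], 1)
assert torch.equal(direct, decomposed)  # values already sit on the quantization grid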

backends/xnnpack/partition/config/generic_node_configs.py

Lines changed: 2 additions & 2 deletions
@@ -181,10 +181,10 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
 
         num_tensors = len(node.all_input_nodes)
 
-        if not (num_tensors >= 2 and num_tensors <= 5):
+        if not (num_tensors >= 2):
             why(
                 node,
-                reason=f"only support concatenation of 2 - 5 tensors, got {num_tensors} tensors",
+                reason=f"only support concatenation of >= 2 tensors, got {num_tensors} tensors",
             )
             return False
 
backends/xnnpack/partition/config/xnnpack_config.py

Lines changed: 0 additions & 4 deletions
@@ -165,10 +165,6 @@ def _check_inputs_are_valid_dtypes(self, node, valid_dtypes):
             if not isinstance(arg_val, torch.Tensor):
                 return False
 
-            # XNNPACK does not support empty tensors
-            if arg_val.numel() == 0:
-                return False
-
             if arg_val.dtype not in valid_dtypes:
                 return False

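For reference, eager-mode torch.cat already accepts tensors that are empty along the concatenation dimension; they simply contribute nothing to the output. This is the pattern the removed numel() check used to reject:

import torch

parts = (torch.randn(1, 2, 3), torch.randn(3, 2, 3), torch.randn(0, 2, 3))
out = torch.cat(parts, dim=0)
print(out.shape)  # torch.Size([4, 2, 3]) -- the empty tensor adds no rows
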
backends/xnnpack/test/ops/test_cat.py

Lines changed: 43 additions & 33 deletions
@@ -93,6 +93,29 @@ def test_fp16_cat4(self):
         )
         self._test_cat(self.Cat(), inputs)
 
+    def test_fp16_cat5(self):
+        """
+        Using Clamp2 because fp16 add is done in fp32 ATM. Need to fix that first.
+        """
+        inputs = (
+            torch.randn(1, 2, 3).to(torch.float16),
+            torch.randn(3, 2, 3).to(torch.float16),
+            torch.randn(2, 2, 3).to(torch.float16),
+            torch.randn(5, 2, 3).to(torch.float16),
+            torch.randn(5, 2, 3).to(torch.float16),
+        )
+        self._test_cat(self.Cat(), inputs)
+
+    def test_fp16_cat_gt_5(self):
+        """
+        Using Clamp2 because fp16 add is done in fp32 ATM. Need to fix that first.
+        """
+        for num_inputs in range(6, 10):
+            inputs = []
+            for _ in range(num_inputs):
+                inputs.append(torch.randn(1, 2, 3).to(torch.float16))
+            self._test_cat(self.Cat(), tuple(inputs))
+
     def test_fp32_cat2(self):
         inputs = (torch.randn(1, 2, 3), torch.randn(3, 2, 3))
         self._test_cat(self.Cat(), inputs)
@@ -120,6 +143,13 @@ def test_fp32_cat5(self):
         )
         self._test_cat(self.Cat(), inputs)
 
+    def test_fp32_cat_gt_5(self):
+        for num_inputs in range(6, 10):
+            inputs = []
+            for _ in range(num_inputs):
+                inputs.append(torch.randn(1, 2, 3))
+            self._test_cat(self.Cat(), tuple(inputs))
+
     def test_qs8_cat2(self):
         inputs = (torch.randn(1, 2, 3), torch.randn(3, 2, 3))
         self._test_cat(self.Cat(), inputs, cat_num=2, quant=True)
@@ -137,46 +167,26 @@ def test_qs8_cat4(self):
         )
         self._test_cat(self.Cat(), inputs, cat_num=4, quant=True)
 
-    def test_fp32_cat_unsupported(self):
-        """
-        XNNPACK only supports concatenating up to 4 values, so it should not delegate here.
-        """
+    def test_qs8_cat5(self):
         inputs = (
             torch.randn(1, 2, 3),
             torch.randn(3, 2, 3),
             torch.randn(2, 2, 3),
             torch.randn(5, 2, 3),
-            torch.randn(1, 2, 3),
-            torch.randn(2, 2, 3),
-        )
-        (
-            Tester(self.Cat(), inputs)
-            .export()
-            .check_count({"torch.ops.aten.cat": 1})
-            .to_edge_transform_and_lower()
-            .check_count({"executorch_exir_dialects_edge__ops_aten_cat": 1})
-        )
-
-    def test_fp32_cat_unsupported_legacy_mode(self):
-        """
-        XNNPACK only supports concatenating up to 5 values, so it should not delegate here.
-        """
-        inputs = (
-            torch.randn(1, 2, 3),
-            torch.randn(3, 2, 3),
-            torch.randn(2, 2, 3),
             torch.randn(5, 2, 3),
-            torch.randn(1, 2, 3),
-            torch.randn(6, 2, 3),
-        )
-        (
-            Tester(self.Cat(), inputs)
-            .export()
-            .check_count({"torch.ops.aten.cat": 1})
-            .to_edge()
-            .partition()
-            .check_count({"executorch_exir_dialects_edge__ops_aten_cat": 1})
         )
+        self._test_cat(self.Cat(), inputs, cat_num=5, quant=True)
+
+    def test_qs8_cat_gt_5(self):
+        for num_inputs in range(6, 10):
+            inputs = []
+            for _ in range(num_inputs):
+                inputs.append(torch.randn(1, 2, 3))
+            self._test_cat(self.Cat(), tuple(inputs), cat_num=num_inputs, quant=True)
+
+    def test_qs8_cat_with_empty_tensor(self):
+        inputs = (torch.randn(1, 2, 3), torch.randn(3, 2, 3), torch.randn(0, 2, 3))
+        self._test_cat(self.Cat(), inputs, cat_num=3, quant=True)
 
 
 class CatNegativeDim(torch.nn.Module):
     def __init__(self):
backends/xnnpack/test/passes/test_decompose_cat.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import unittest
+
+import torch
+from executorch.backends.xnnpack._passes.decompose_cat import DecomposeConcatenate
+from executorch.backends.xnnpack.test.tester import RunPasses, Tester
+
+
+class TestDecomposeCatPass(unittest.TestCase):
+    PassStage = RunPasses([DecomposeConcatenate])
+    cat_name = "executorch_exir_dialects_edge__ops_aten_cat_default"
+
+    class Cat(torch.nn.Module):
+        def forward(self, *args):
+            xs = [*args]
+            x = torch.cat(xs)
+            return x + x  # Quantize by propagation.
+
+    def test_cat_gt_5(self):
+        for num_inputs in range(6, 10):
+            inputs = []
+            for _ in range(num_inputs):
+                inputs.append(torch.randn(1, 2, 3))
+
+            # One cat for the first 5 inputs, then one more for every
+            # additional group of up to 4 inputs.
+            num_cats = int(len(inputs) > 5)
+            num_cats += math.ceil((len(inputs) - 5) / 4)
+            (
+                Tester(self.Cat(), tuple(inputs))
+                .export()
+                .to_edge()
+                .check_count({self.cat_name: 1})
+                .run_passes(self.PassStage)
+                .check_count({self.cat_name: num_cats})
+                .run_method_and_compare_outputs()
+            )
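
As a sanity check on the num_cats arithmetic in the test above, simulating the greedy chunking (the first cat consumes 5 inputs, each later cat consumes the previous result plus up to 4 more) reproduces the same counts. count_cats is a hypothetical helper for illustration:

import math

def count_cats(num_inputs, max_inputs=5):
    # One cat for the first `max_inputs` inputs, then one more cat per
    # group of (max_inputs - 1) remaining inputs.
    cats, remaining = 1, num_inputs - max_inputs
    while remaining > 0:
        cats += 1
        remaining -= max_inputs - 1
    return cats

for n in range(6, 10):
    assert count_cats(n) == int(n > 5) + math.ceil((n - 5) / 4)  # 2 cats for 6-9 inputs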
