@@ -551,6 +551,7 @@ def greedy(
     graph_signature: Optional[ExportGraphSignature] = None,
     alloc_graph_input: bool = True,
     alloc_graph_output: bool = True,
+    input_buffer_sizes: Optional[List[int]] = None,
 ) -> List[int]:
     spec2obj = {}
     shared_objects = defaultdict(list)
@@ -574,18 +575,17 @@ def greedy(
 
     if len(shared_objects) == 0:
         # Cannot find any tensor in the graph that needs to be allocated.
-        # Return [0, 0] to be consistent with default behavior of naive.
-        total_sizes = [0, 0]
+        # Return the input sizes or [0, 0] to be consistent with default behavior of naive.
+        total_sizes = input_buffer_sizes or [0, 0]
     else:
-        total_sizes = [0] * (max(shared_objects.keys()) + 1)
+        num_buffers = max(shared_objects.keys()) + 1
+        if input_buffer_sizes is None:
+            total_sizes = [0] * num_buffers
+        else:
+            total_sizes = input_buffer_sizes + [0] * (num_buffers - len(input_buffer_sizes))
+
         for mem_id in shared_objects:
-            input_total_size = 0
-            if bufsizes := getattr(graph_module, "input_mem_buffer_sizes", None):
-                if len(bufsizes) > mem_id:
-                    input_total_size = bufsizes[mem_id]
-            total_sizes[mem_id] = materialize_buffer(
-                shared_objects[mem_id], input_total_size
-            )
+            total_sizes[mem_id] = materialize_buffer(shared_objects[mem_id], total_sizes[mem_id])
 
     # Since we now know the number of shared objects we need and the size of
     # each shared object, we can assign offset in the memory buffer for each
@@ -604,6 +604,7 @@ def naive(
     graph_signature: Optional[ExportGraphSignature] = None,
     alloc_graph_input: bool = True,
     alloc_graph_output: bool = True,
+    input_buffer_sizes: Optional[List[int]] = None,
 ) -> List[int]:
 
     # allocate 'allocated' bytes from buffer with id mem_id.
@@ -615,10 +616,7 @@ def _allocate_buf(bufsizes: List[int], mem_id: int, allocated: int) -> int:
         bufsizes[mem_id] += allocated
         return ret
 
-    bufsizes = getattr(graph_module, "input_mem_buffer_sizes", None)
-    if bufsizes is None:
-        bufsizes = [0, 0]
-
+    bufsizes = input_buffer_sizes or [0, 0]
     bufsizes = typing.cast(List[int], bufsizes)
     for spec in collect_specs_from_nodes(
         graph_module.graph.nodes,
@@ -727,6 +725,8 @@ def apply_algo(
     graph_signature: Optional[ExportGraphSignature] = None,
     alloc_graph_input: bool = True,
     alloc_graph_output: bool = True,
+    # sizes of buffers already allocated when recursing into submodules
+    input_buffer_sizes: Optional[List[int]] = None,
 ) -> List[int]:
     """
     Recursively apply algo to graph_module and its submodules for control flow.
@@ -741,43 +741,46 @@ def apply_algo(
741741 """
742742 specs = update_all_tensors_lifetime (graph_module , graph_signature )
743743 bufsizes : List [int ] = algo (
744- graph_module , alignment , graph_signature , alloc_graph_input , alloc_graph_output
744+ graph_module ,
745+ alignment ,
746+ graph_signature ,
747+ alloc_graph_input ,
748+ alloc_graph_output ,
749+ input_buffer_sizes ,
745750 )
746751 insert_calls_to_free (graph_module , specs )
747752
748753 def handle_submodule (
749- submodule_nd : torch .fx .Node , alloc_graph_input : bool = False
754+ submodule_nd : torch .fx .Node , current_buffer_sizes , alloc_graph_input : bool = False
-    ) -> None:
+    ) -> List[int]:
-        nonlocal bufsizes
         assert submodule_nd.op == "get_attr"
         submodule = getattr(graph_module, submodule_nd.target)
-        # memory planning for submodule need to be aware of the amount of
-        # buffer already allocated.
-        submodule.input_mem_buffer_sizes = bufsizes
+        submodule.input_mem_buffer_sizes = current_buffer_sizes
         bufsizes = apply_algo(
             algo,
             submodule,
             alignment,
             graph_signature,
             alloc_graph_input=alloc_graph_input,
             alloc_graph_output=True,
+            input_buffer_sizes=current_buffer_sizes,
         )
         submodule.meta.update({"non_const_buffer_sizes": bufsizes})
+        return bufsizes
 
     for cond_node in get_cond_nodes(graph_module):
-        handle_submodule(typing.cast(torch.fx.Node, cond_node.args[1]))
-        handle_submodule(typing.cast(torch.fx.Node, cond_node.args[2]))
+        bufsizes = handle_submodule(typing.cast(torch.fx.Node, cond_node.args[1]), bufsizes)
+        bufsizes = handle_submodule(typing.cast(torch.fx.Node, cond_node.args[2]), bufsizes)
 
     for while_node in get_while_nodes(graph_module):
-        handle_submodule(typing.cast(torch.fx.Node, while_node.args[0]))
-        handle_submodule(typing.cast(torch.fx.Node, while_node.args[1]))
+        bufsizes = handle_submodule(typing.cast(torch.fx.Node, while_node.args[0]), bufsizes)
+        bufsizes = handle_submodule(typing.cast(torch.fx.Node, while_node.args[1]), bufsizes)
     # TODO: Add test coverage for map operator once dynamo tracing is
     # fully supported for this. T142287208
     for map_node in get_map_nodes(graph_module):
-        handle_submodule(
-            typing.cast(torch.fx.Node, map_node.args[0]), alloc_graph_input=True
+        bufsizes = handle_submodule(
+            typing.cast(torch.fx.Node, map_node.args[0]), bufsizes, alloc_graph_input=True
         )
 
     graph_module.meta.update({"non_const_buffer_sizes": bufsizes})
-
     return bufsizes
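
For context on what the patch does: the planner for a control-flow submodule now receives the buffer sizes its parent has already committed and keeps growing those same buffers, instead of reading them off an input_mem_buffer_sizes attribute on the module. Below is a minimal sketch of that behavior; it uses a toy planner (toy_naive_plan is invented for illustration and is not part of the ExecuTorch API).

from typing import List, Optional


def toy_naive_plan(
    tensor_sizes: List[int], input_buffer_sizes: Optional[List[int]] = None
) -> List[int]:
    # Start from the totals the parent already reserved, or a fresh [0, 0].
    bufsizes = list(input_buffer_sizes or [0, 0])
    for size in tensor_sizes:
        bufsizes[1] += size  # every tensor lands in mem_id 1 in this toy model
    return bufsizes


parent_sizes = toy_naive_plan([32, 64])           # -> [0, 96]
child_sizes = toy_naive_plan([16], parent_sizes)  # -> [0, 112]; submodule data sits after the parent's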