99import collections
1010import itertools
1111import logging
12+ import math
1213import typing
1314from functools import partial
1415from typing import Iterable , List , Optional , Tuple
@@ -39,6 +40,12 @@ def get_size(memory_config: MemoryConfig, exir_id: int) -> int:
3940 return memory_config .memory_sizes [exir_id - 1 ]
4041
4142
def get_aligned_offset(pre_aligned_offset: int, alignment: int) -> int:
    """Round ``pre_aligned_offset`` up to the next multiple of ``alignment``.

    Args:
        pre_aligned_offset: Byte offset to be aligned; assumed non-negative.
        alignment: Required alignment in bytes. An alignment of 0 means
            "no alignment constraint" and returns the offset unchanged.

    Returns:
        The smallest multiple of ``alignment`` that is >= ``pre_aligned_offset``.

    Uses integer ceiling division instead of ``math.ceil`` on a float
    quotient: routing the offset through a float silently loses precision
    once offsets exceed 2**53, producing under-aligned (i.e. wrong) results.
    """
    if alignment == 0:
        return pre_aligned_offset
    # Classic overflow-free integer round-up; exact for arbitrarily large ints.
    return ((pre_aligned_offset + alignment - 1) // alignment) * alignment
47+
48+
4249def collect_specs_from_graph_module (
4350 graph_module : torch .fx .GraphModule ,
4451 alloc_graph_input : bool ,
@@ -95,7 +102,7 @@ def overlap(spec: TensorSpec) -> Optional[TensorSpec]:
95102 return None
96103
def memory_available(spec: TensorSpec) -> bool:
    # True iff the spec, with its end padded up to the configured
    # alignment, still fits inside the bank it was assigned to.
    aligned_end = get_aligned_offset(
        spec.mem_offset + spec.allocated_memory, alignment
    )
    return aligned_end <= get_size(memory_config, spec.mem_id)
101108
@@ -116,7 +123,7 @@ def memory_available(spec: TensorSpec) -> bool:
116123 continue
117124 spec .mem_offset = 0
118125 while memory_available (spec ) and (overlapped := overlap (spec )):
119- spec .mem_offset = overlapped .mem_offset + overlapped .allocated_memory
126+ spec .mem_offset = get_aligned_offset ( overlapped .mem_offset + overlapped .allocated_memory , alignment )
120127 if memory_available (spec ):
121128 allocated_buffers [spec .mem_id ].append (spec )
122129 bufsizes [spec .mem_id ] = max (
@@ -202,11 +209,11 @@ def greedy_by_size_for_offset_calculation_with_hierarchy(
202209 # calculation of gap incorrect. Moving it out will make the algorithm degenerate
203210 # to the naive one, reusing 0 tensor. The paper may have a typo here.
204211 prev_offset = max (
205- allocated_spec .mem_offset + allocated_spec .allocated_memory ,
212+ get_aligned_offset ( allocated_spec .mem_offset + allocated_spec .allocated_memory , alignment ) ,
206213 prev_offset ,
207214 )
208215 if spec .mem_offset is None :
209- if prev_offset + spec .allocated_memory > get_size (
216+ if get_aligned_offset ( prev_offset + spec .allocated_memory , alignment ) > get_size (
210217 memory_config , spec .mem_id
211218 ):
212219 continue
@@ -423,6 +430,7 @@ def __init__(
423430 ]
424431 ]
425432 ] = None ,
433+ mem_alignment : int = 0 ,
426434 ) -> None :
427435 self ._init_mem_algos ()
428436
@@ -432,6 +440,7 @@ def __init__(
432440 self .alloc_graph_input = alloc_graph_input
433441 self .alloc_graph_output = alloc_graph_output
434442 self .additional_constraint_gen_passes = additional_constraint_gen_passes
443+ self .mem_alignment = mem_alignment
435444
436445 def _init_mem_algos (self ) -> None :
437446 self .available_mem_algos = [
@@ -459,6 +468,7 @@ def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult:
459468 allow_lifetime_and_storage_overlap = (self .opt_level >= 2 ),
460469 alloc_graph_input = self .alloc_graph_input ,
461470 alloc_graph_output = self .alloc_graph_output ,
471+ alignment = self .mem_alignment ,
462472 )
463473 mem_planning (graph_module )
464474
0 commit comments