From 31f28e2680b73bddb7872c5d94c5122c3d6a9ecd Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Thu, 23 Jan 2025 21:16:21 -0800
Subject: [PATCH 1/3] [ET][Memory planning] Improve greedy memory planning.

This diff replaces the old greedy algorithm. The old algorithm produced
plans 35% worse than the theoretical optimum. This matters even more for
long contexts, where the additional overhead can be a few hundred MB. For
example, the theoretical optimum for the llama3_2 8B, 4-bit quantized
model with a context length of 2k is about 1GB of memory; this theoretical
max can be observed by looking at the peaks in the memory profile. The
current algorithm planned about 1.6GB of memory, while the new algorithm
reduces that to about 1.1GB.

Differential Revision: [D68448332](https://our.internmc.facebook.com/intern/diff/D68448332/)

[ghstack-poisoned]
---
 exir/memory_planning.py             | 165 ++++++++++++++++++++++++----
 exir/passes/memory_planning_pass.py |   1 +
 exir/tests/test_memory_planning.py  |  44 +++++++-
 3 files changed, 185 insertions(+), 25 deletions(-)

diff --git a/exir/memory_planning.py b/exir/memory_planning.py
index be471b6f745..f7cd6e7a6db 100644
--- a/exir/memory_planning.py
+++ b/exir/memory_planning.py
@@ -11,7 +11,7 @@
 import operator
 import typing
 from collections import defaultdict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import torch
@@ -454,6 +454,18 @@ def update_all_tensors_lifetime(
     return specs
 
 
+@dataclass
+class AllocationSpec:
+    """
+    AllocationSpec is used to represent the allocation of a tensor.
+    """
+
+    # The offset of the tensor in the shared object/pool.
+    offset: int
+    # TensorSpec
+    spec: TensorSpec
+
+
 @dataclass
 class SharedObject:
     r"""
@@ -470,8 +482,15 @@ class SharedObject:
     offset: int
     # size of this shared object in bytes
     size: int
+    # the index at which the object is first used
+    first_used_index: int
     # the object will be available for index (last_used_index + 1)
     last_used_index: int
+    # list of allocations that belong to this shared object
+    allocations: List[AllocationSpec] = field(default_factory=list)
+
+    def __repr__(self) -> str:
+        return f"SharedObject(idx={self.idx}, offset={self.offset}, size={self.size}, lifetime=[{self.first_used_index}, {self.last_used_index}])"
 
 
 def materialize_buffer(
@@ -489,35 +508,122 @@ def materialize_buffer(
     return total_size
 
 
-def _size_abs_dif(sobj: SharedObject, spec: TensorSpec) -> int:
+def _does_not_overlap(sobj: SharedObject, spec: TensorSpec) -> bool:
     r"""
-    Calculate the absolute different between the size of a shared object and
-    a tensor.
+    Check that the tensor's lifetime does not overlap with any allocation in
+    the shared object.
     """
-    return abs(sobj.size - spec.allocated_memory)
+    for alloc in sobj.allocations:
+        if not (
+            spec.lifetime[1] < alloc.spec.lifetime[0]
+            or spec.lifetime[0] > alloc.spec.lifetime[1]
+        ):
+            return False
+    return True
+
+
+def _find_max_overlapping_allocations_offset(
+    sobj: SharedObject, spec: TensorSpec
+) -> int:
+    max_offset = 0
+    for alloc in sobj.allocations:
+        if (
+            spec.lifetime[1] < alloc.spec.lifetime[0]
+            or spec.lifetime[0] > alloc.spec.lifetime[1]
+        ):
+            continue
+        max_offset = max(alloc.offset + alloc.spec.allocated_memory, max_offset)
+    return max_offset
 
 
 def pick_shared_obj(
     shared_objects: List[SharedObject], spec: TensorSpec
 ) -> SharedObject:
     r"""
-    Pick the available shared object with closest size to the tensor.
-    If there are no available shared object left, create a new one.
+    Pick an available shared object to which to assign this spec,
+    or create a new one.
+
+    Algorithm details:
+    Previous: look at every spec in chronological order and check whether a
+    previously allocated object can fit it. If not, allocate a new object.
+    New:
+    - Sort all the specs by allocation size.
+    - Process the specs in that order.
+    - If the spec's size is smaller than a previously allocated bucket:
+      - Conditions under which the previously allocated bucket can be used:
+        - The lifetime of the spec does not overlap with the lifetime of the bucket.
+          - In this case allocate the spec to that bucket and expand its lifetime.
+          - The spec is allocated at offset = 0 in this bucket.
+          - Add this spec to the bucket's list of specs.
+        - The lifetime of the spec overlaps with the lifetime of the bucket,
+          partially or fully (e.g. the spec's lifetime is a subset of the bucket's):
+          - If none of the specs in the bucket overlap with this spec's lifetime:
+            - Allocate the spec to the bucket at offset = 0.
+            - Add this spec to the bucket's list of specs.
+            - Expand the bucket's lifetime to account for the added spec's lifetime.
+          - If one or more specs in the bucket overlap with this spec's lifetime:
+            - Collect the offsets (at which each overlapping spec is allocated
+              in the bucket) of all the overlapping specs, and find the largest
+              end offset, i.e. max over (offset + size) of the overlapping specs.
+            - Allocate the spec to the bucket at that largest end offset.
+            - Add this spec to the bucket's list of specs.
+            - Expand the bucket's lifetime to account for the added spec's lifetime.
+      - If none of these conditions are met, allocate a new bucket:
+        - Add the spec to this bucket.
+        - Set the bucket's lifetime to that of the spec.
+    - If the spec's size is larger than all previously allocated buckets,
+      allocate a new bucket:
+      - The size and lifetime of this bucket are those of the spec.
+
+    Proof of correctness:
+    - If allocating a new bucket, the allocation is trivially correct.
+    - If allocating the spec to an existing bucket whose allocations' lifetimes
+      do not overlap with the spec's lifetime, the allocation is correct.
+    Proof of correctness by induction when adding a spec to an existing bucket:
+    - If all previous allocations in the given bucket are correct:
+      - Then the new one being added must be correct, because when the requested
+        allocation overlaps with one or more previous allocations, we find the
+        largest end offset among all the overlapping allocations and allocate
+        the new spec at that offset. Hence, an allocation at such an offset
+        cannot overlap with any previous allocation.
+    Base case: a newly added allocation within a bucket holding a single
+    allocation is correct, because a) it must fit, and b) either its lifetime
+    does not overlap with the existing allocation's (so sharing offset 0 is
+    safe), or it is placed at a disjoint span past that allocation's end.
+    This holds true because of the following invariants:
+    - Once a bucket is created, it is never resized.
+    - All the allocations within a bucket satisfy the following: the spans,
+      defined by offset .. offset + size, of two allocations may overlap only
+      if their lifetimes do not overlap.
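+
+    Worked example (hypothetical sizes and lifetimes, for illustration only):
+    suppose the specs, sorted by size, are A(64 bytes, lifetime [0, 2]),
+    B(32, [1, 5]), C(16, [3, 4]) and D(16, [3, 4]).
+    - A creates bucket 0 (size 64) and sits at offset 0.
+    - B's lifetime overlaps A's, and 64 + 32 exceeds bucket 0's size, so B
+      creates bucket 1 (size 32) at offset 0.
+    - C overlaps nothing in bucket 0 (A is dead by index 3), so C reuses
+      bucket 0 at offset 0 and extends its lifetime to [0, 4].
+    - D overlaps C, whose allocation ends at offset 16; 16 + 16 <= 64, so D
+      lands in bucket 0 at offset 16.
+    The plan needs 96 bytes for 128 bytes of tensors.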
""" - # TODO: do better than linear scan picked = None for sobj in shared_objects: - if spec.lifetime[0] > sobj.last_used_index: - if picked is None or _size_abs_dif(sobj, spec) < _size_abs_dif( - picked, spec - ): - picked = sobj - sobj.last_used_index = spec.lifetime[1] - sobj.size = max(sobj.size, spec.allocated_memory) + if _does_not_overlap(sobj, spec): + assert sobj.size >= spec.allocated_memory, "Allocation specs are not sorted" + picked = sobj + sobj.first_used_index = min(sobj.first_used_index, spec.lifetime[0]) + sobj.last_used_index = max(sobj.last_used_index, spec.lifetime[1]) + allocation_spec = AllocationSpec(0, spec) + picked.allocations.append(allocation_spec) + break + + if picked is None: + for sobj in shared_objects: + max_offset = _find_max_overlapping_allocations_offset(sobj, spec) + if max_offset > 0: + if max_offset + spec.allocated_memory <= sobj.size: + picked = sobj + sobj.first_used_index = min(sobj.first_used_index, spec.lifetime[0]) + sobj.last_used_index = max(sobj.last_used_index, spec.lifetime[1]) + allocation_spec = AllocationSpec(max_offset, spec) + picked.allocations.append(allocation_spec) + break + if picked is None: picked = SharedObject( - len(shared_objects), -1, spec.allocated_memory, spec.lifetime[1] + len(shared_objects), + -1, + spec.allocated_memory, + spec.lifetime[0], + spec.lifetime[1], ) + allocation_spec = AllocationSpec(0, spec) + picked.allocations.append(allocation_spec) + picked.first_used_index = spec.lifetime[0] + picked.last_used_index = spec.lifetime[1] shared_objects.append(picked) return picked @@ -565,6 +671,9 @@ def greedy( # For each tensor, pick the available shared object with closest size to # the tensor. If there are no available shared object left, create a new # one. + import bisect + + sorted_specs = [] for spec in collect_specs_from_nodes( graph_module.graph.nodes, graph_signature, @@ -572,6 +681,10 @@ def greedy( ignore_graph_input=not alloc_graph_input, ignore_graph_output=not alloc_graph_output, ): + bisect.insort(sorted_specs, spec, key=lambda x: x.allocated_memory) + sorted_specs.reverse() + + for spec in sorted_specs: if spec.mem_id is None: spec.mem_id = 1 spec.realign(alignment) @@ -583,6 +696,7 @@ def greedy( total_sizes = [0, 0] else: total_sizes = [0] * (max(shared_objects.keys()) + 1) + num_specs_processed = 0 for mem_id in shared_objects: input_total_size = 0 if bufsizes := getattr(graph_module, "input_mem_buffer_sizes", None): @@ -594,13 +708,18 @@ def greedy( total_sizes[mem_id] = materialize_buffer( shared_objects[mem_id], input_total_size ) - - # Since we now know the number of shared objects we need and the size of - # each shared object, we can assign offset in the memory buffer for each - # shared object. - for spec, sobj in spec2obj.items(): - spec.mem_obj_id = sobj.idx - spec.mem_offset = sobj.offset + # Since we now know the number of shared objects we need and the size of + # each shared object, we can assign offset in the memory buffer for each + # shared object. 
+        for sobj in shared_objects[mem_id]:
+            for alloc in sobj.allocations:
+                # Final offset = the shared object's base offset plus the
+                # allocation's offset within that object.
+                alloc.spec.mem_obj_id = sobj.idx
+                alloc.spec.mem_offset = sobj.offset + alloc.offset
+                num_specs_processed += 1
+    assert (
+        len(spec2obj) == num_specs_processed
+    ), f"Expected to process all {len(spec2obj)} specs, but processed {num_specs_processed}"
 
     logging.debug(f"greedy algorithm returns bufsizes: {total_sizes}")
     return total_sizes
diff --git a/exir/passes/memory_planning_pass.py b/exir/passes/memory_planning_pass.py
index 112b8f5fc52..2188aed945a 100644
--- a/exir/passes/memory_planning_pass.py
+++ b/exir/passes/memory_planning_pass.py
@@ -127,4 +127,5 @@ def run(
             f"The {getattr(self.memory_planning_algo, '__name__', repr(self.memory_planning_algo))} algorithm reuses storage for {num_reuse_pairs} pair of tensors"
         )
         verifier.verify_graph_input_output()
+        verifier.verify_storage_reuse()
         return PassResult(graph_module, True)
diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py
index 1f94f0341f1..e63f790c30e 100644
--- a/exir/tests/test_memory_planning.py
+++ b/exir/tests/test_memory_planning.py
@@ -106,6 +106,28 @@ def get_random_inputs(self) -> Tuple[torch.Tensor, ...]:
         return (torch.randn(2),)
 
 
+class LinearsWithDifferentSizeAndViewOps(torch.nn.Module):
+    def __init__(self) -> None:
+        super(LinearsWithDifferentSizeAndViewOps, self).__init__()
+        self.linears = torch.nn.ModuleList()
+        for x in [8, 16, 32, 64]:
+            self.linears.append(torch.nn.Linear(x, x * 2))
+
+    def forward(self, i: torch.Tensor) -> torch.Tensor:
+        o1 = i
+        for linear in self.linears:
+            o1 = linear(o1)
+        o1 = o1.view(-1, 64, 2)
+        o1 = o1 + 1
+        o2 = i
+        for linear in self.linears:
+            o2 = linear(o2)
+        return o1.view(-1, 128) + o2
+
+    def get_random_inputs(self) -> Tuple[torch.Tensor, ...]:
+        return (torch.randn(3, 8),)
+
+
 class ModuleReturnTwo(nn.Module):
     def __init__(self) -> None:
         super(ModuleReturnTwo, self).__init__()
@@ -360,6 +382,13 @@ def verify_overlap_placeholders(
         ],
     )
 
+    test_linear_with_view: Callable[..., None] = maketest(
+        LinearsWithDifferentSizeAndViewOps,
+        criteria=[
+            (greedy, True),
+        ],
+    )
+
     # greedy algorithm will reuse memory if we let the algorithm allocate
     # memory for both graph input and output.
    test_list_arg: Callable[..., None] = maketest(
@@ -508,15 +537,26 @@ def test_multiple_pools(
         verifier.verify_graph_input_output()
 
         idx = 0
+        reference_output = dict()
+        actual_output = dict()
         for node in graph_module.graph.nodes:
             if node.op == "placeholder" or (
                 node.op == "call_function"
                 and node.target in (torch.ops.aten.add.out, torch.ops.aten.mul.out)
             ):
                 mem_id, mem_offset = expected_allocs[idx]
-                self.assertEqual(node.meta["spec"].mem_id, mem_id)
-                self.assertEqual(node.meta["spec"].mem_offset, mem_offset)
+                actual_mem_id, actual_mem_offset = (
+                    node.meta["spec"].mem_id,
+                    node.meta["spec"].mem_offset,
+                )
+                # Count expected and actual (mem_id, mem_offset) pairs
+                # independently; allocations may be assigned in a different
+                # order, so compare the two multisets at the end.
+                reference_output[(mem_id, mem_offset)] = (
+                    reference_output.get((mem_id, mem_offset), 0) + 1
+                )
+                actual_output[(actual_mem_id, actual_mem_offset)] = (
+                    actual_output.get((actual_mem_id, actual_mem_offset), 0) + 1
+                )
                 idx += 1
+        self.assertEqual(reference_output, actual_output)
         self.assertEqual(graph_module.meta["non_const_buffer_sizes"], expected_bufsizes)
 
     def test_constants_not_memory_planned(self) -> None:

From 816efe94bac45bd0fe88fceb6f104746d6a5ee59 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Fri, 24 Jan 2025 10:29:02 -0800
Subject: [PATCH 2/3] Update on "[ET][Memory planning] Improve greedy memory planning."

This diff replaces the old greedy algorithm. The old algorithm produced
plans 35% worse than the theoretical optimum. This matters even more for
long contexts, where the additional overhead can be a few hundred MB. For
example, the theoretical optimum for the llama3_2 8B, 4-bit quantized
model with a context length of 2k is about 1GB of memory; this theoretical
max can be observed by looking at the peaks in the memory profile. The
current algorithm planned about 1.6GB of memory, while the new algorithm
reduces that to about 1.1GB.

Differential Revision: [D68448332](https://our.internmc.facebook.com/intern/diff/D68448332/)

[ghstack-poisoned]
---
 exir/memory_planning.py             | 20 +++++++++++++++++++-
 exir/passes/memory_planning_pass.py | 23 +++++++++++++++++++++--
 exir/tests/test_joint_graph.py      |  4 ++--
 3 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/exir/memory_planning.py b/exir/memory_planning.py
index f7cd6e7a6db..4c4574c9609 100644
--- a/exir/memory_planning.py
+++ b/exir/memory_planning.py
@@ -117,6 +117,17 @@ def storage_overlap(cls, lhs_spec: TensorSpec, rhs_spec: TensorSpec) -> bool:
 
         return has_overlap
 
+    @classmethod
+    def _debug_message_from_specs(
+        cls, lhs_spec: TensorSpec, rhs_spec: TensorSpec
+    ) -> str:
+        message = (
+            f"lhs lifetime: {lhs_spec.lifetime}, rhs lifetime: {rhs_spec.lifetime} "
+        )
+        message += f"lhs: mem_id {lhs_spec.mem_id} storage: {lhs_spec.mem_offset}, {lhs_spec.allocated_memory} "
+        message += f"rhs: mem_id {rhs_spec.mem_id} storage: {rhs_spec.mem_offset}, {rhs_spec.allocated_memory}"
+        return message
+
     def verify_storage_reuse(
         self, allow_lifetime_and_storage_overlap: bool = False
     ) -> int:
@@ -159,7 +170,7 @@ def verify_storage_reuse(
                     lhs_spec, rhs_spec
                 ):
                     raise InternalError(
-                        f"Unexpected storage overlap: lhs {lhs_spec}, rhs {rhs_spec}"
+                        f"Unexpected storage overlap: {Verifier._debug_message_from_specs(lhs_spec, rhs_spec)}"
                     )
 
     # Check that each mem_obj_id is consistent with whether the tensors have
@@ -708,6 +719,13 @@ def greedy(
         total_sizes[mem_id] = materialize_buffer(
             shared_objects[mem_id], input_total_size
         )
+        # Pad each buffer with 64 bytes. This requirement really exists for
+        # the XNNPACK backend, which can read beyond the end of a tensor
+        # for performance reasons. While accounting for a backend-specific
+        # requirement is not the right choice in backend-agnostic memory
+        # planning, we do it here for now.
+        total_sizes[mem_id] += 64
     # Since we now know the number of shared objects we need and the size of
     # each shared object, we can assign offset in the memory buffer for each
     # shared object.
diff --git a/exir/passes/memory_planning_pass.py b/exir/passes/memory_planning_pass.py
index 2188aed945a..1e086dd56a0 100644
--- a/exir/passes/memory_planning_pass.py
+++ b/exir/passes/memory_planning_pass.py
@@ -6,7 +6,8 @@
 
 import logging
 import warnings
-from typing import Callable, List, Optional
+from typing import Any, Callable, List, Optional
+from functools import partial
 
 import torch
 from executorch.exir.error import internal_assert
@@ -24,6 +25,17 @@
 from torch.export.exported_program import ExportGraphSignature
 
 
+# copied from https://stackoverflow.com/questions/75582932/python-how-can-i-print-the-function-name-of-a-partial-function
+def _callable_name(any_callable: Callable[..., Any]) -> str:
+    if isinstance(any_callable, partial):
+        return any_callable.func.__name__
+
+    try:
+        return any_callable.__name__
+    except AttributeError:
+        return str(any_callable)
+
+
 class MemoryPlanningPass(PassBase):
     def __init__(
         self,
@@ -127,5 +139,12 @@ def run(
             f"The {getattr(self.memory_planning_algo, '__name__', repr(self.memory_planning_algo))} algorithm reuses storage for {num_reuse_pairs} pair of tensors"
         )
         verifier.verify_graph_input_output()
-        verifier.verify_storage_reuse()
+        if (
+            callable(self.memory_planning_algo)
+            and _callable_name(self.memory_planning_algo) == "greedy"
+        ):
+            # Only verify storage reuse for the greedy algorithm. At the
+            # moment, Cadence backends' memory planning fails this check. I
+            # don't know whether that is valid, but if it is, we should
+            # adjust the verify_storage_reuse function.
+            verifier.verify_storage_reuse()
         return PassResult(graph_module, True)
diff --git a/exir/tests/test_joint_graph.py b/exir/tests/test_joint_graph.py
index f3b6f0ed557..349fa92e826 100644
--- a/exir/tests/test_joint_graph.py
+++ b/exir/tests/test_joint_graph.py
@@ -84,13 +84,13 @@ def forward(self, x, y):
             et.executorch_program.execution_plan[0]
             .values[0]
             .val.allocation_info.memory_offset_low,
-            0,
+            96,
         )
         self.assertEqual(
             et.executorch_program.execution_plan[0]
             .values[1]
             .val.allocation_info.memory_offset_low,
-            48,
+            224,
         )
 
         loss = m(*example_inputs)

From 3975a751e3161cd559e9cf58da8d029664d52980 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Fri, 24 Jan 2025 14:49:36 -0800
Subject: [PATCH 3/3] Update on "[ET][Memory planning] Improve greedy memory planning."

This diff replaces the old greedy algorithm. The old algorithm produced
plans 35% worse than the theoretical optimum. This matters even more for
long contexts, where the additional overhead can be a few hundred MB. For
example, the theoretical optimum for the llama3_2 8B, 4-bit quantized
model with a context length of 2k is about 1GB of memory; this theoretical
max can be observed by looking at the peaks in the memory profile. The
current algorithm planned about 1.6GB of memory, while the new algorithm
reduces that to about 1.1GB.
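This update also threads an `allow_overlapping_allocations` flag through
`greedy` so that a backend can opt out of overlapping allocations. As a
sketch of the pattern used here (import paths are assumptions; see the
Vulkan diff below for the actual usage):

    from functools import partial

    from executorch.exir.memory_planning import greedy
    from executorch.exir.passes import MemoryPlanningPass

    # Bind the flag, then hand the partial to the pass as the algorithm.
    greedy_no_overlap = partial(greedy, allow_overlapping_allocations=False)
    mem_planning_pass = MemoryPlanningPass(memory_planning_algo=greedy_no_overlap)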
Differential Revision: [D68448332](https://our.internmc.facebook.com/intern/diff/D68448332/)

[ghstack-poisoned]
---
 backends/vulkan/vulkan_preprocess.py |  8 +++++++-
 exir/memory_planning.py              | 19 ++++++++++++++++---
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py
index 6e406a10ba6..e0608bd94e4 100644
--- a/backends/vulkan/vulkan_preprocess.py
+++ b/backends/vulkan/vulkan_preprocess.py
@@ -6,6 +6,8 @@
 
 # pyre-strict
 
+from functools import partial
+
 from typing import Any, Dict, final, List
 
 import executorch.backends.vulkan.utils as utils
@@ -18,6 +20,9 @@
 from executorch.backends.transforms.fuse_dequant_linear import FuseDequantLinearPass
 from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform
+from executorch.exir.memory_planning import (
+    greedy,
+)
 from executorch.backends.vulkan._passes import (
     insert_prepack_nodes,
     RemoveLocalScalarDenseOpsTransform,
@@ -189,11 +194,12 @@ def preprocess(  # noqa: C901
 
         # Finally, apply dynamic shape passes and memory planning pass. These passes
         # must be applied only when the graph structure is finalized.
+        greedy_memory_planning = partial(greedy, allow_overlapping_allocations=False)
         program = apply_passes(
             program,
             [
                 ConstraintBasedSymShapeEvalPass(),
-                MemoryPlanningPass(),
+                MemoryPlanningPass(memory_planning_algo=greedy_memory_planning),
             ],
         )
 
diff --git a/exir/memory_planning.py b/exir/memory_planning.py
index 4c4574c9609..b1e32f1a2fa 100644
--- a/exir/memory_planning.py
+++ b/exir/memory_planning.py
@@ -547,7 +547,9 @@ def _find_max_overlapping_allocations_offset(
 
 
 def pick_shared_obj(
-    shared_objects: List[SharedObject], spec: TensorSpec
+    shared_objects: List[SharedObject],
+    spec: TensorSpec,
+    allow_overlapping_allocations: bool = True,
 ) -> SharedObject:
     r"""
     Pick an available shared object to which to assign this spec,
@@ -611,7 +613,7 @@ def pick_shared_obj(
             picked.allocations.append(allocation_spec)
             break
 
-    if picked is None:
+    if picked is None and allow_overlapping_allocations:
         for sobj in shared_objects:
             max_offset = _find_max_overlapping_allocations_offset(sobj, spec)
             if max_offset > 0:
@@ -673,7 +675,16 @@ def greedy(
     graph_signature: Optional[ExportGraphSignature] = None,
     alloc_graph_input: bool = True,
     alloc_graph_output: bool = True,
+    allow_overlapping_allocations: bool = True,
 ) -> List[int]:
+    r"""Greedy algorithm to allocate memory for tensors in the graph.
+
+    alloc_graph_input: if True, allocate memory for the graph inputs.
+    alloc_graph_output: if True, allocate memory for the graph outputs.
+    allow_overlapping_allocations: if True (the default), allow allocations
+        that overlap in their lifetimes but sit at different offsets within
+        the same storage. This flag exists so that Vulkan can use
+        MemoryPlanningPass with overlapping allocations disabled.
+    """
     spec2obj = {}
     shared_objects = defaultdict(list)
     # Don't do assertion in collect_specs_from_nodes if we have already encountered
@@ -699,7 +710,9 @@ def greedy(
         if spec.mem_id is None:
             spec.mem_id = 1
         spec.realign(alignment)
-        spec2obj[spec] = pick_shared_obj(shared_objects[spec.mem_id], spec)
+        spec2obj[spec] = pick_shared_obj(
+            shared_objects[spec.mem_id], spec, allow_overlapping_allocations
+        )
 
     if len(shared_objects) == 0:
         # Cannot find any tensor in the graph that needs to be allocated.
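
As an aside for readers who want to experiment with the packing strategy
outside of ExecuTorch: below is a minimal, self-contained sketch of the
size-sorted interval-packing idea behind the new greedy planner. The names
(`Spec`, `Bucket`, `plan`) are invented for illustration, and the two lookup
passes of `pick_shared_obj` are folded into one; this is not the ExecuTorch
implementation.

    from dataclasses import dataclass, field
    from typing import List, Tuple


    @dataclass
    class Spec:
        size: int
        lifetime: Tuple[int, int]  # inclusive [first_use, last_use]
        bucket: int = -1
        offset: int = -1


    @dataclass
    class Bucket:
        size: int
        allocs: List[Spec] = field(default_factory=list)


    def _lifetimes_overlap(a: Tuple[int, int], b: Tuple[int, int]) -> bool:
        return not (a[1] < b[0] or a[0] > b[1])


    def plan(specs: List[Spec]) -> List[Bucket]:
        buckets: List[Bucket] = []
        # Largest first, so every spec fits inside any existing bucket and
        # only lifetime conflicts can force a new bucket.
        for spec in sorted(specs, key=lambda s: s.size, reverse=True):
            for idx, bucket in enumerate(buckets):
                # Earliest offset past every allocation whose lifetime
                # conflicts with this spec (0 if nothing conflicts).
                off = max(
                    (a.offset + a.size
                     for a in bucket.allocs
                     if _lifetimes_overlap(a.lifetime, spec.lifetime)),
                    default=0,
                )
                if off + spec.size <= bucket.size:
                    spec.bucket, spec.offset = idx, off
                    bucket.allocs.append(spec)
                    break
            else:
                # No existing bucket can host the spec; open a new one.
                spec.bucket, spec.offset = len(buckets), 0
                buckets.append(Bucket(spec.size, [spec]))
        return buckets


    # The worked example from the pick_shared_obj docstring: 96 bytes
    # planned for 128 bytes of tensors.
    demo = [Spec(64, (0, 2)), Spec(32, (1, 5)), Spec(16, (3, 4)), Spec(16, (3, 4))]
    print(sum(b.size for b in plan(demo)))  # -> 96

Note that, unlike the patch, this sketch never checks "does not overlap"
separately: the `default=0` case covers it. It keeps the same invariant that
a bucket is never resized after creation.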