Merge pull request #131 from johnmarktaylor91/perf/postprocess-pipeline-optimizations

johnmarktaylor91 · web-flow · commit 69099e8c4d79 · 2026-03-09T11:03:05.000-04:00
diff --git a/torchlens/postprocess/__init__.py b/torchlens/postprocess/__init__.py
@@ -34,6 +34,7 @@
 
 from typing import TYPE_CHECKING, List
 
+import time
 import torch
 
 from ..utils.tensor_utils import safe_copy
@@ -99,84 +100,102 @@ def postprocess(
         _set_pass_finished(self)
         return
 
-    # Steps 1-3: Graph traversal (output nodes, ancestry, orphan removal)
-    with _vtimed(self, "Steps 1-3: Graph traversal"):
-        # Step 1: Add dedicated output nodes
+    _vprint(
+        self,
+        f"Postprocessing {len(self._raw_layer_labels_list):,} layers "
+        f"({len(self.buffer_layers):,} buffers)...",
+    )
+    _post_t0 = time.time() if getattr(self, "verbose", False) else 0
+
+    # Step 1: Add dedicated output nodes
+    with _vtimed(self, "  Step 1: Add output layers"):
         _add_output_layers(self, output_tensors, output_tensor_addresses)
 
-        # Step 2: Trace which nodes are ancestors of output nodes
+    # Step 2: Trace which nodes are ancestors of output nodes
+    with _vtimed(self, "  Step 2: Trace output ancestors"):
         _find_output_ancestors(self)
 
-        # Step 3: Remove orphan nodes, find nodes that don't terminate in output node
+    # Step 3: Remove orphan nodes, find nodes that don't terminate in an output node
+    with _vtimed(self, "  Step 3: Remove orphan nodes"):
         _remove_orphan_nodes(self)
 
     # Step 4: Find min/max distance from input and output nodes.
     # Conditional: only runs when the user requested distance metadata.
     if self.mark_input_output_distances:
-        with _vtimed(self, "Step 4: Input/output distances"):
+        with _vtimed(self, "  Step 4: Input/output distances"):
             _mark_input_output_distances(self)
 
-    # Steps 5-7: Control flow (conditional branches, module fixing, buffers)
-    with _vtimed(self, "Steps 5-7: Control flow"):
-        # Step 5: Starting from terminal single boolean tensors, mark the conditional branches.
+    # Step 5: Starting from terminal single boolean tensors, mark the conditional branches.
+    with _vtimed(self, "  Step 5: Mark conditional branches"):
         _mark_conditional_branches(self)
 
-        # Step 6: Annotate the containing modules for all internally-generated tensors.
+    # Step 6: Annotate the containing modules for all internally-generated tensors.
+    with _vtimed(self, "  Step 6: Fix module containment"):
         _fix_modules_for_internal_tensors(self)
 
-        # Step 7: Fix the buffer passes and parent information.
+    # Step 7: Fix the buffer passes and parent information.
+    with _vtimed(self, "  Step 7: Fix buffer layers"):
         _fix_buffer_layers(self)
 
     # Step 8: Identify all loops, mark repeated layers.
     loop_desc = (
-        "Step 8: Loop detection (full)"
+        "  Step 8: Loop detection (full)"
         if self.detect_loops
-        else "Step 8: Loop detection (params only)"
+        else "  Step 8: Loop detection (params only)"
     )
     with _vtimed(self, loop_desc):
         if self.detect_loops:
             _detect_and_label_loops(self)
         else:
             _group_by_shared_params(self)
 
-    # Steps 9-12: Labeling (label mapping, final info, rename, cleanup)
-    with _vtimed(self, "Steps 9-12: Labeling"):
-        # Step 9: Go down tensor list, get the mapping from raw tensor names to final tensor names.
+    # Step 9: Go down tensor list, get the mapping from raw tensor names to final tensor names.
+    with _vtimed(self, "  Step 9: Map labels"):
         _map_raw_labels_to_final_labels(self)
 
-        # Step 10: Log final info for all layers
+    # Step 10: Log final info for all layers
+    with _vtimed(self, "  Step 10: Log final info"):
         _log_final_info_for_all_layers(self)
 
-        # Step 11: Rename all raw labels to final labels
+    # Step 11: Rename all raw labels to final labels
+    with _vtimed(self, "  Step 11: Rename labels"):
         _rename_model_history_layer_names(self)
         _trim_and_reorder_model_history_fields(self)
 
-        # Step 12: Remove unsaved layers, build lookup key mappings
+    # Step 12: Remove unsaved layers, build lookup key mappings
+    with _vtimed(self, "  Step 12: Build lookup keys"):
         _remove_unwanted_entries_and_log_remaining(self)
 
-    # Steps 13-18: Finalization
-    with _vtimed(self, "Steps 13-18: Finalization"):
-        # Step 13: Undecorate all saved tensors and remove saved grad_fns.
+    # Step 13: Undecorate all saved tensors and remove saved grad_fns.
+    with _vtimed(self, "  Step 13: Undecorate tensors"):
         _undecorate_all_saved_tensors(self)
 
-        # Step 14: Clear the cache after any tensor deletions for garbage collection purposes:
-        torch.cuda.empty_cache()
+    # Step 14: Clear the cache after any tensor deletions for garbage collection purposes.
+    torch.cuda.empty_cache()
 
-        # Step 15: Log time elapsed.
+    # Step 15: Log time elapsed.
+    with _vtimed(self, "  Step 15: Log timing"):
         _log_time_elapsed(self)
 
-        # Step 16: Populate ParamLog reverse mappings, linked params, num_passes, and gradient metadata.
+    # Step 16: Populate ParamLog reverse mappings, linked params, num_passes, and gradient metadata.
+    with _vtimed(self, "  Step 16: Finalize params"):
         _finalize_param_logs(self)
 
-        # Step 16.5: Build aggregate LayerLog objects from per-pass LayerPassLog entries.
+    # Step 16.5: Build aggregate LayerLog objects from per-pass LayerPassLog entries.
+    with _vtimed(self, "  Step 16.5: Build layer logs"):
         _build_layer_logs(self)
 
-        # Step 17: Build structured ModuleLog objects from raw module_* dicts.
+    # Step 17: Build structured ModuleLog objects from raw module_* dicts.
+    with _vtimed(self, "  Step 17: Build module logs"):
         _build_module_logs(self)
 
-        # Step 18: log the pass as finished, changing the ModelLog behavior to its user-facing version.
+    # Step 18: Log the pass as finished, changing the ModelLog behavior to its user-facing version.
+    with _vtimed(self, "  Step 18: Mark pass finished"):
         _set_pass_finished(self)
 
+    if getattr(self, "verbose", False):
+        print(f"[torchlens] Postprocessing complete ({time.time() - _post_t0:.2f}s)")
+
 
 def postprocess_fast(self: "ModelLog") -> None:
     """Lightweight postprocessing for fast (second-pass) logging mode.
diff --git a/torchlens/postprocess/control_flow.py b/torchlens/postprocess/control_flow.py
@@ -310,9 +310,14 @@ def _fix_modules_for_internal_tensors(self) -> None:
     # Append module path suffix to operation_equivalence_type for ALL tensors.
     # This ensures loop detection (Step 8) treats same-function operations in
     # different modules as distinct equivalence types.
+    _module_str_cache = {}
     for layer in self:
-        module_str = "_".join([module_pass[0] for module_pass in layer.containing_modules])
-        layer.operation_equivalence_type += module_str
+        cm_key = tuple(layer.containing_modules)
+        if cm_key not in _module_str_cache:
+            _module_str_cache[cm_key] = "_".join(
+                [module_pass[0] for module_pass in layer.containing_modules]
+            )
+        layer.operation_equivalence_type += _module_str_cache[cm_key]
 
 
 def _fix_modules_for_single_internal_tensor(
diff --git a/torchlens/postprocess/finalization.py b/torchlens/postprocess/finalization.py
@@ -46,12 +46,14 @@ def _undecorate_all_saved_tensors(self) -> None:
         if layer_entry.activation is not None:
             tensors_to_undecorate.append(layer_entry.activation)
 
-        tensors_to_undecorate.extend(
-            get_vars_of_type_from_obj(layer_entry.captured_args, torch.Tensor, search_depth=2)
-        )
-        tensors_to_undecorate.extend(
-            get_vars_of_type_from_obj(layer_entry.captured_kwargs, torch.Tensor, search_depth=2)
-        )
+        if layer_entry.captured_args:
+            tensors_to_undecorate.extend(
+                get_vars_of_type_from_obj(layer_entry.captured_args, torch.Tensor, search_depth=2)
+            )
+        if layer_entry.captured_kwargs:
+            tensors_to_undecorate.extend(
+                get_vars_of_type_from_obj(layer_entry.captured_kwargs, torch.Tensor, search_depth=2)
+            )
 
     for t in tensors_to_undecorate:
         if hasattr(t, "tl_tensor_label_raw"):
@@ -321,7 +323,9 @@ class ModuleParamInfo(NamedTuple):
     buffer_layers: list
 
 
-def _build_module_param_info(self: "ModelLog", address: str, mbd: dict) -> ModuleParamInfo:
+def _build_module_param_info(
+    self: "ModelLog", address: str, mbd: dict, _buffer_layers_by_module: Optional[dict] = None
+) -> ModuleParamInfo:
     """Gather parameter counts, sizes, and buffer layers for a single module."""
     from ..data_classes.param_log import ParamAccessor
 
@@ -332,14 +336,17 @@ def _build_module_param_info(self: "ModelLog", address: str, mbd: dict) -> Modul
     m_num_frozen = mbd["module_nparams_frozen"].get(address, 0)
     m_fsize = sum(pl.memory for pl in module_param_dict.values())
 
-    module_buffer_layers = [
-        bl
-        for bl in self.buffer_layers
-        if bl in self.layer_dict_all_keys
-        and hasattr(self.layer_dict_all_keys[bl], "buffer_address")
-        and self.layer_dict_all_keys[bl].buffer_address is not None
-        and self.layer_dict_all_keys[bl].buffer_address.rsplit(".", 1)[0] == address
-    ]
+    if _buffer_layers_by_module is not None:
+        module_buffer_layers = list(_buffer_layers_by_module.get(address, []))
+    else:
+        module_buffer_layers = [
+            bl
+            for bl in self.buffer_layers
+            if bl in self.layer_dict_all_keys
+            and hasattr(self.layer_dict_all_keys[bl], "buffer_address")
+            and self.layer_dict_all_keys[bl].buffer_address is not None
+            and self.layer_dict_all_keys[bl].buffer_address.rsplit(".", 1)[0] == address
+        ]
 
     return ModuleParamInfo(
         module_params, m_num_params, m_num_trainable, m_num_frozen, m_fsize, module_buffer_layers
@@ -395,6 +402,15 @@ def _build_module_logs(self: "ModelLog") -> None:
         for _alias in _meta.get("all_addresses", [_primary_addr]):
             _metadata_by_alias[_alias] = _meta
 
+    # Pre-compute buffer layers grouped by parent module address (O6).
+    _buffer_layers_by_module = defaultdict(list)
+    for bl in self.buffer_layers:
+        if bl in self.layer_dict_all_keys:
+            bl_entry = self.layer_dict_all_keys[bl]
+            if hasattr(bl_entry, "buffer_address") and bl_entry.buffer_address is not None:
+                module_addr = bl_entry.buffer_address.rsplit(".", 1)[0]
+                _buffer_layers_by_module[module_addr].append(bl)
+
     # --- Build ModuleLogs for each submodule ---
     for address in mbd["module_addresses"]:
         meta = _metadata_by_alias.get(address, {})
@@ -425,7 +441,7 @@ def _build_module_logs(self: "ModelLog") -> None:
             all_module_addresses=all_addresses,
         )
         call_children_all, call_parent_addr = _resolve_call_hierarchy(passes)
-        param_info = _build_module_param_info(self, address, mbd)
+        param_info = _build_module_param_info(self, address, mbd, _buffer_layers_by_module)
 
         # address_children from metadata may have a different address prefix
         # when the metadata was captured for a shared module under a different
diff --git a/torchlens/postprocess/labeling.py b/torchlens/postprocess/labeling.py
@@ -22,7 +22,7 @@
 """
 
 import weakref
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
 from typing import TYPE_CHECKING
 
 from ..constants import MODEL_LOG_FIELD_ORDER, LAYER_PASS_LOG_FIELD_ORDER
@@ -491,7 +491,7 @@ def _trim_and_reorder_layer_entry_fields(layer_entry: LayerPassLog) -> None:
     Callable attributes (methods) are excluded from the reordered dict.
     """
     old_dict = layer_entry.__dict__
-    new_dir_dict = OrderedDict()
+    new_dir_dict = {}
     # First: fields in canonical order.
     for field in LAYER_PASS_LOG_FIELD_ORDER:
         if field in old_dict:
@@ -563,18 +563,11 @@ def _rename_model_history_layer_names(self) -> None:
 
     mla = self._module_build_data["module_layer_argnames"]
     for module_pass, arglist in mla.items():
-        inds_to_remove = set()
-        for a, arg in enumerate(arglist):
-            raw_name = mla[module_pass][a][0]
-            if raw_name not in self._raw_to_final_layer_labels:
-                inds_to_remove.add(a)
-                continue
-            new_name = self._raw_to_final_layer_labels[raw_name]
-            argname = mla[module_pass][a][1]
-            mla[module_pass][a] = (new_name, argname)
-        mla[module_pass] = [
-            mla[module_pass][i] for i in range(len(arglist)) if i not in inds_to_remove
-        ]
+        new_arglist = []
+        for raw_name, argname in arglist:
+            if raw_name in self._raw_to_final_layer_labels:
+                new_arglist.append((self._raw_to_final_layer_labels[raw_name], argname))
+        mla[module_pass] = new_arglist
 
 
 def _trim_and_reorder_model_history_fields(self) -> None:
@@ -584,7 +577,7 @@ def _trim_and_reorder_model_history_fields(self) -> None:
     Public fields listed in MODEL_LOG_FIELD_ORDER come first, followed by any
     private fields (starting with ``_``) not already in the order list.
     """
-    new_dir_dict = OrderedDict()
+    new_dir_dict = {}
     for field in MODEL_LOG_FIELD_ORDER:
         new_dir_dict[field] = getattr(self, field)
     # Preserve all remaining fields not in the canonical order (private/internal
diff --git a/torchlens/postprocess/loop_detection.py b/torchlens/postprocess/loop_detection.py
@@ -382,9 +382,17 @@ def union(x, y, uf=uf_parent):
             if rx != ry:
                 uf[rx] = ry
 
-        for member1, member2 in it.combinations(members, 2):
-            if member_neighbor_isos[member1] & member_neighbor_isos[member2]:
-                union(member1, member2)
+        # Reverse-index approach: union members sharing a neighbor key.
+        # O(members × avg_neighbors) instead of O(members²).
+        _reverse_index = defaultdict(list)
+        for member_label in members:
+            for neighbor_key in member_neighbor_isos[member_label]:
+                _reverse_index[neighbor_key].append(member_label)
+        for members_with_key in _reverse_index.values():
+            if len(members_with_key) > 1:
+                first = members_with_key[0]
+                for other in members_with_key[1:]:
+                    union(first, other)
 
         components = defaultdict(list)
         for member in members:
@@ -628,10 +636,16 @@ def _find_isomorphic_matches(
 
     # Remove collisions: if the same node appears in multiple (node, subgraph) tuples,
     # discard all occurrences to avoid assigning one node to multiple subgraphs.
-    node_labels = [node[0] for node in new_equivalent_nodes]
-    new_equivalent_nodes = [
-        node for node in new_equivalent_nodes if node_labels.count(node[0]) == 1
-    ]
+    _seen_labels = set()
+    _dupe_labels = set()
+    for node in new_equivalent_nodes:
+        if node[0] in _seen_labels:
+            _dupe_labels.add(node[0])
+        _seen_labels.add(node[0])
+    if _dupe_labels:
+        new_equivalent_nodes = [
+            node for node in new_equivalent_nodes if node[0] not in _dupe_labels
+        ]
     return new_equivalent_nodes
 
 
@@ -776,26 +790,31 @@ def _union(x: str, y: str) -> None:
     for iso_nodes_orig in iso_node_groups.values():
         all_iso_nodes.update(iso_nodes_orig)
 
+    # Pre-compute param types per subgraph for O(1) lookup in the pair loop (O10).
+    _sg_param_types: Dict[str, frozenset] = {}
+    for iso_nodes_orig in iso_node_groups.values():
+        for node_label in iso_nodes_orig:
+            sg = node_to_subgraph[node_label]
+            sg_label = sg.starting_node
+            if sg_label not in _sg_param_types:
+                _sg_param_types[sg_label] = frozenset(
+                    self[pnode].operation_equivalence_type for pnode in sg.param_nodes
+                )
+
     # PASS 1: Within iso-groups — merge nodes whose subgraphs share param types or are adjacent.
     for iso_group_label, iso_nodes_orig in iso_node_groups.items():
         iso_nodes = sorted(iso_nodes_orig)
         for node1_label, node2_label in it.combinations(iso_nodes, 2):
-            node1_subgraph = node_to_subgraph[node1_label]
-            node2_subgraph = node_to_subgraph[node2_label]
-            node1_subgraph_label = node1_subgraph.starting_node
-            node2_subgraph_label = node2_subgraph.starting_node
-            node1_param_types = [
-                self[pnode].operation_equivalence_type for pnode in node1_subgraph.param_nodes
-            ]
-            node2_param_types = [
-                self[pnode].operation_equivalence_type for pnode in node2_subgraph.param_nodes
-            ]
-            overlapping_param_types = set(node1_param_types).intersection(set(node2_param_types))
+            node1_subgraph_label = node_to_subgraph[node1_label].starting_node
+            node2_subgraph_label = node_to_subgraph[node2_label].starting_node
+            overlapping_param_types = (
+                _sg_param_types[node1_subgraph_label] & _sg_param_types[node2_subgraph_label]
+            )
             subgraphs_are_adjacent = (
                 node1_subgraph_label in adjacent_subgraphs
                 and node2_subgraph_label in adjacent_subgraphs[node1_subgraph_label]
             )
-            if (len(overlapping_param_types) > 0) or subgraphs_are_adjacent:
+            if overlapping_param_types or subgraphs_are_adjacent:
                 _union(node1_label, node2_label)
 
     # PASS 2: Cross iso-groups — unconditionally merge by (func, params) identity.