Updated pre-processing runtime merging logic

Juntian Liu · web-flow · commit f63e2980781b · 2025-07-09T20:14:25.000-07:00
Differential Revision: D77905958 Pull Request resolved: #12302
diff --git a/devtools/inspector/_inspector.py b/devtools/inspector/_inspector.py
@@ -60,6 +60,7 @@
     is_debug_output,
     is_inference_output_equal,
     map_runtime_aot_intermediate_outputs,
+    merge_runtime_overlapping_debug_handles,
     ProgramOutput,
     RESERVED_FRAMEWORK_EVENT_NAMES,
     TimeScale,
@@ -1208,6 +1209,8 @@ def _get_runtime_intermediate_outputs_and_op_names(
                         event.debug_data,
                     )
                     debug_handle_to_op_name[debug_handle] = event.name
+
+        merge_runtime_overlapping_debug_handles(debug_handle_to_output)
         return {
             k: v[1] for k, v in debug_handle_to_output.items()
         }, debug_handle_to_op_name
@@ -1387,7 +1390,7 @@ def calculate_numeric_gap(self, distance: str = "MSE") -> pd.DataFrame:
         )
         if len(aot_intermediate_outputs) == 0 or len(aot_debug_handle_to_op_name) == 0:
             raise ValueError(
-                "calculate_numerical_gap error: The aot debug information is required but not populated"
+                "Missing etrecord or missing representative inputs within etrecord, both of which are required for calculating numerical gap"
             )
         # The runtime_op_names will be used later to map runtime debug_handle to op_name
         runtime_intermediate_outputs, runtime_debug_handle_to_op_name = (
diff --git a/devtools/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py
@@ -538,49 +538,71 @@ def compare_results(
     return results
 
 
-def merge_overlapping_debug_handles(
-    intermediate_outputs: Dict[DebugHandle, Any]
-) -> Dict[DebugHandle, Any]:
+def _merge_runtime_debug_handles(
+    debug_handle1: DebugHandle, debug_handle2: DebugHandle
+) -> DebugHandle:
     """
-    Merges overlapping debug handles into a single key in the dict.
-    For each debug handle, this function checks for overlaps with existing keys in the merged dict.
-    If overlaps are found, it combines the overlapping keys into a single key by taking the union of their elements.
-    The value associated with the merged key is determined by the debug handle with the highest last element.
+    Merge two DebugHandles by removing elements from debug_handle1 that are also present in debug_handle2,
+    while preserving the relative order of elements in both modified debug_handle1 and debug_handle2.
+    All elements from the modified debug_handle1 will appear before any elements from debug_handle2.
     """
 
-    if len(intermediate_outputs) == 0:
-        return {}
+    # Initialize a list to store unique elements in order
+    unique_ordered_list = []
+
+    # Build a set of debug_handle2's elements for fast membership checks
+    seen = set(debug_handle2)
+
+    for item in debug_handle1:
+        # Keep the element only if it does not also appear in debug_handle2
+        if item not in seen:
+            unique_ordered_list.append(item)
 
-    merged: Dict[DebugHandle, Any] = {}
+    for item in debug_handle2:
+        unique_ordered_list.append(item)
+    return tuple(unique_ordered_list)
 
-    for debug_handle, value in intermediate_outputs.items():
-        debug_handle_set = set(debug_handle)
-        curr_debug_handle, last_value = debug_handle, value
 
-        # collect any existing keys that overlap with the current key
+def merge_runtime_overlapping_debug_handles(
+    intermediate_outputs: Dict[DebugHandle, Tuple[int, Any]]
+) -> Dict[DebugHandle, Tuple[int, Any]]:
+    """
+    Merges runtime intermediate outputs with overlapping debug handles into a single key in the dict.
+
+    For each debug handle, this function checks for overlaps with existing keys.
+    If overlaps are found, it combines the overlapping keys into a single key by taking
+    the union of their elements while maintaining the order. The order is preserved such that the
+    elements of the debug_handle with the higher instruction_id appear after those of the lower one.
+
+    The value associated with the merged key is determined by the debug handle with the highest instruction id.
+    """
+    if len(intermediate_outputs) == 0:
+        return {}
+    merged: Dict[DebugHandle, Tuple[int, Any]] = {}
+    for debug_handle, (instruction_id, debug_data) in intermediate_outputs.items():
+        curr_debug_handle, last_value = debug_handle, (instruction_id, debug_data)
+        # Collect any existing keys that overlap with the current key
         to_remove = []
         for existing_debug_handle, existing_value in merged.items():
-            if debug_handle_set.intersection(set(existing_debug_handle)):
-                # abosrb their ints
-                debug_handle_set |= set(existing_debug_handle)
-                if existing_debug_handle[-1] > curr_debug_handle[-1]:
-                    curr_debug_handle, last_value = (
-                        existing_debug_handle,
-                        existing_value,
+            if any(item in existing_debug_handle for item in debug_handle):
+                # Keep the value with the highest instruction_id
+                # Also merge the debug handles, placing the one with the higher instruction_id last
+                if existing_value[0] < instruction_id:
+                    curr_debug_handle = _merge_runtime_debug_handles(
+                        existing_debug_handle, curr_debug_handle
                     )
+                else:
+                    curr_debug_handle = _merge_runtime_debug_handles(
+                        curr_debug_handle, existing_debug_handle
+                    )
+                    last_value = existing_value
                 to_remove.append(existing_debug_handle)
-
-        # remove all the keys that overlap with the current key
+        # Remove all the keys that overlap with the current key
         for debug_handle in to_remove:
             merged.pop(debug_handle)
-
-        # add the current key to the merged one
-        new_debug_handle = tuple(sorted(debug_handle_set))
-        merged[new_debug_handle] = last_value
-
-    # Sort the merged debug handles in ascending order based on their last element
-    # TODO: Consider adding more logic to align the order with the execution order
-    return dict(sorted(merged.items(), key=lambda item: item[0][-1]))
+        # Add the current key to the merged one
+        merged[curr_debug_handle] = last_value
+    return merged
 
 
 def _debug_handles_have_overlap(
@@ -696,12 +718,6 @@ def map_runtime_aot_intermediate_outputs(
         Dict[Tuple[DebugHandle, Any], Tuple[DebugHandle, Any]] - Mapping
         from runtime intermediate output to AOT intermediate output
     """
-    # Merge overlapping debug handles
-    aot_intermediate_outputs = merge_overlapping_debug_handles(aot_intermediate_outputs)
-    runtime_intermediate_outputs = merge_overlapping_debug_handles(
-        runtime_intermediate_outputs
-    )
-
     # Create a graph(nodes and edges) of overlapping(between aot and runtime) debug handles
     nodes, edges = _create_debug_handle_overlap_graph(
         aot_intermediate_outputs, runtime_intermediate_outputs
diff --git a/devtools/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py
@@ -39,7 +39,7 @@
     get_aot_debug_handle_to_op_name_mapping,
     is_inference_output_equal,
     map_runtime_aot_intermediate_outputs,
-    merge_overlapping_debug_handles,
+    merge_runtime_overlapping_debug_handles,
     NodeFilter,
     TimeScale,
 )
@@ -228,44 +228,50 @@ def test_compare_results_uint8(self):
     def test_merge_overlapping_debug_handles_basic(self):
         big_tensor = torch.rand(100, 100)
         intermediate_outputs = {
-            (1, 2, 3): "val1",
-            (2, 3, 4, 5): "val2",
-            (6, 7, 8): "val3",
-            (10, 11): "val4",
-            (11, 12): big_tensor,
+            (1, 2, 3): (1, "val1"),
+            (2, 3, 4, 5): (2, "val2"),
+            (6, 7, 8): (3, "val3"),
+            (10, 11): (4, "val4"),
+            (11, 12): (5, big_tensor),
         }
         # basic merge behavior
-        intermediate_outputs = merge_overlapping_debug_handles(intermediate_outputs)
+        intermediate_outputs = merge_runtime_overlapping_debug_handles(
+            intermediate_outputs
+        )
         expected_intermediate_outputs = {
-            (1, 2, 3, 4, 5): "val2",
-            (6, 7, 8): "val3",
-            (10, 11, 12): big_tensor,
+            (1, 2, 3, 4, 5): (2, "val2"),
+            (6, 7, 8): (3, "val3"),
+            (10, 11, 12): (5, big_tensor),
         }
-
         self.assertEqual(intermediate_outputs, expected_intermediate_outputs)
-        self.assertIs(expected_intermediate_outputs[(10, 11, 12)], big_tensor)
+        self.assertIs(expected_intermediate_outputs[(10, 11, 12)][1], big_tensor)
 
     def test_merge_overlapping_debug_handles_non_continuous(self):
-        tensor1 = (torch.randn(3, 4),)
-        tensor2 = (torch.randn(2, 3),)
-        tensor3 = (torch.randn(4, 5),)
-        tensor4 = (torch.randn(6, 7),)
-        tensor5 = (torch.randn(8, 9),)
+        tensor1 = torch.randn(3, 4)
+        tensor2 = torch.randn(2, 3)
+        tensor3 = torch.randn(4, 5)
+        tensor4 = torch.randn(6, 7)
+        tensor5 = torch.randn(8, 9)
         intermediate_outputs = {
-            (1, 10): tensor1,
-            (2, 5): tensor2,
-            (1, 7, 9): tensor3,
-            (11, 13): tensor4,
-            (11, 15): tensor5,
+            (1, 10): (1, tensor1),
+            (2, 5): (2, tensor2),
+            (1, 7, 9): (3, tensor3),
+            (11, 13): (4, tensor4),
+            (11, 15): (5, tensor5),
         }
-        intermediate_outputs = merge_overlapping_debug_handles(intermediate_outputs)
+        intermediate_outputs = merge_runtime_overlapping_debug_handles(
+            intermediate_outputs
+        )
         expected_intermediate_outputs = {
-            (2, 5): tensor2,
-            (1, 7, 9, 10): tensor1,
-            (11, 13, 15): tensor5,
+            (2, 5): (2, tensor2),
+            (10, 1, 7, 9): (3, tensor3),
+            (13, 11, 15): (5, tensor5),
         }
 
-        self.assertEqual(intermediate_outputs, expected_intermediate_outputs)
+        for key in expected_intermediate_outputs:
+            expected_value = expected_intermediate_outputs[key][1]
+            actual_value = intermediate_outputs[key][1]
+            self.assertTrue(torch.allclose(expected_value, actual_value))
 
     def test_map_runtime_aot_intermediate_outputs_empty_inputs(self):
         # When the inputs are empty, the output should also be empty