Skip to content

Commit 5dcb971

Browse files
authored
Merge branch 'main' into shoumikhin-patch-1
2 parents fa7cf89 + dd06b3b commit 5dcb971

File tree

16 files changed

+104
-93
lines changed

16 files changed

+104
-93
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
7cda4017ddda554752e89069ae205be5e8388f59
1+
9b498d3bb28b8e3411ce464dd2755c5b96d92c8f

.ci/scripts/check_c10_sync.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,4 @@ pushd pytorch
1212
git checkout "$pytorch_pin"
1313
popd
1414
"$(dirname "${BASH_SOURCE[0]}")"/compare_dirs.sh runtime/core/portable_type/c10/c10 pytorch/c10
15-
"$(dirname "${BASH_SOURCE[0]}")"/compare_dirs.sh runtime/core/portable_type/c10/torch/headeronly pytorch/torch/headeronly
15+
"$(dirname "${BASH_SOURCE[0]}")"/compare_dirs.sh runtime/core/portable_type/c10/torch/standalone pytorch/torch/standalone

devtools/inspector/_inspector.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
is_debug_output,
6161
is_inference_output_equal,
6262
map_runtime_aot_intermediate_outputs,
63+
merge_runtime_overlapping_debug_handles,
6364
ProgramOutput,
6465
RESERVED_FRAMEWORK_EVENT_NAMES,
6566
TimeScale,
@@ -1208,6 +1209,8 @@ def _get_runtime_intermediate_outputs_and_op_names(
12081209
event.debug_data,
12091210
)
12101211
debug_handle_to_op_name[debug_handle] = event.name
1212+
1213+
merge_runtime_overlapping_debug_handles(debug_handle_to_output)
12111214
return {
12121215
k: v[1] for k, v in debug_handle_to_output.items()
12131216
}, debug_handle_to_op_name
@@ -1387,7 +1390,7 @@ def calculate_numeric_gap(self, distance: str = "MSE") -> pd.DataFrame:
13871390
)
13881391
if len(aot_intermediate_outputs) == 0 or len(aot_debug_handle_to_op_name) == 0:
13891392
raise ValueError(
1390-
"calculate_numerical_gap error: The aot debug information is required but not populated"
1393+
"Missing etrecord or missing representative inputs within etrecord, both of which are required for calculating numerical gap"
13911394
)
13921395
# The runtime_op_names will be used later to map runtime debug_handle to op_name
13931396
runtime_intermediate_outputs, runtime_debug_handle_to_op_name = (

devtools/inspector/_inspector_utils.py

Lines changed: 53 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -538,49 +538,71 @@ def compare_results(
538538
return results
539539

540540

541-
def merge_overlapping_debug_handles(
542-
intermediate_outputs: Dict[DebugHandle, Any]
543-
) -> Dict[DebugHandle, Any]:
541+
def _merge_runtime_debug_handles(
542+
debug_handle1: DebugHandle, debug_handle2: DebugHandle
543+
) -> DebugHandle:
544544
"""
545-
Merges overlapping debug handles into a single key in the dict.
546-
For each debug handle, this function checks for overlaps with existing keys in the merged dict.
547-
If overlaps are found, it combines the overlapping keys into a single key by taking the union of their elements.
548-
The value associated with the merged key is determined by the debug handle with the highest last element.
545+
Merge two DebugHandles by removing elements from debug_handle1 that are also present in debug_handle2,
546+
while preserving the relative order of elements in both modified debug_handle1 and debug_handle2.
547+
All elements from the modified debug_handle1 will appear before any elements from debug_handle2.
549548
"""
550549

551-
if len(intermediate_outputs) == 0:
552-
return {}
550+
# Initialize a list to store unique elements in order
551+
unique_ordered_list = []
552+
553+
# Initialize a set to track elements that have already been seen
554+
seen = set(debug_handle2)
555+
556+
for item in debug_handle1:
557+
# If the element has not been seen before, add it to the list and mark it as seen
558+
if item not in seen:
559+
unique_ordered_list.append(item)
553560

554-
merged: Dict[DebugHandle, Any] = {}
561+
for item in debug_handle2:
562+
unique_ordered_list.append(item)
563+
return tuple(unique_ordered_list)
555564

556-
for debug_handle, value in intermediate_outputs.items():
557-
debug_handle_set = set(debug_handle)
558-
curr_debug_handle, last_value = debug_handle, value
559565

560-
# collect any existing keys that overlap with the current key
566+
def merge_runtime_overlapping_debug_handles(
567+
intermediate_outputs: Dict[DebugHandle, Tuple[int, Any]]
568+
) -> Dict[DebugHandle, Tuple[int, Any]]:
569+
"""
570+
Merges runtimes with overlapping debug handles into a single key in the dict.
571+
572+
For each debug handle, this function checks for overlaps with existing keys.
573+
If overlaps are found, it combines the overlapping keys into a single key by taking
574+
the union of their elements while maintaining the order. The order is preserved such that
575+
the debug_handle with the higher instruction_id appears after the one with the lower instruction_id.
576+
577+
The value associated with the merged key is determined by the debug handle with the highest instruction id.
578+
"""
579+
if len(intermediate_outputs) == 0:
580+
return {}
581+
merged: Dict[DebugHandle, Tuple[int, Any]] = {}
582+
for debug_handle, (instruction_id, debug_data) in intermediate_outputs.items():
583+
curr_debug_handle, last_value = debug_handle, (instruction_id, debug_data)
584+
# Collect any existing keys that overlap with the current key
561585
to_remove = []
562586
for existing_debug_handle, existing_value in merged.items():
563-
if debug_handle_set.intersection(set(existing_debug_handle)):
564-
# abosrb their ints
565-
debug_handle_set |= set(existing_debug_handle)
566-
if existing_debug_handle[-1] > curr_debug_handle[-1]:
567-
curr_debug_handle, last_value = (
568-
existing_debug_handle,
569-
existing_value,
587+
if any(item in existing_debug_handle for item in debug_handle):
588+
# Keep the value with the highest instruction_id
589+
# Also merge the debug handles, so the handle with the higher instruction_id comes last
590+
if existing_value[0] < instruction_id:
591+
curr_debug_handle = _merge_runtime_debug_handles(
592+
existing_debug_handle, curr_debug_handle
570593
)
594+
else:
595+
curr_debug_handle = _merge_runtime_debug_handles(
596+
curr_debug_handle, existing_debug_handle
597+
)
598+
last_value = existing_value
571599
to_remove.append(existing_debug_handle)
572-
573-
# remove all the keys that overlap with the current key
600+
# Remove all the keys that overlap with the current key
574601
for debug_handle in to_remove:
575602
merged.pop(debug_handle)
576-
577-
# add the current key to the merged one
578-
new_debug_handle = tuple(sorted(debug_handle_set))
579-
merged[new_debug_handle] = last_value
580-
581-
# Sort the merged debug handles in ascending order based on their last element
582-
# TODO: Consider adding more logic to align the order with the execution order
583-
return dict(sorted(merged.items(), key=lambda item: item[0][-1]))
603+
# Add the current key to the merged one
604+
merged[curr_debug_handle] = last_value
605+
return merged
584606

585607

586608
def _debug_handles_have_overlap(
@@ -696,12 +718,6 @@ def map_runtime_aot_intermediate_outputs(
696718
Dict[Tuple[DebugHandle, Any], Tuple[DebugHandle, Any]] - Mapping
697719
from runtime intermediate output to AOT intermediate output
698720
"""
699-
# Merge overlapping debug handles
700-
aot_intermediate_outputs = merge_overlapping_debug_handles(aot_intermediate_outputs)
701-
runtime_intermediate_outputs = merge_overlapping_debug_handles(
702-
runtime_intermediate_outputs
703-
)
704-
705721
# Create a graph(nodes and edges) of overlapping(between aot and runtime) debug handles
706722
nodes, edges = _create_debug_handle_overlap_graph(
707723
aot_intermediate_outputs, runtime_intermediate_outputs

devtools/inspector/tests/inspector_utils_test.py

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
get_aot_debug_handle_to_op_name_mapping,
4040
is_inference_output_equal,
4141
map_runtime_aot_intermediate_outputs,
42-
merge_overlapping_debug_handles,
42+
merge_runtime_overlapping_debug_handles,
4343
NodeFilter,
4444
TimeScale,
4545
)
@@ -228,44 +228,50 @@ def test_compare_results_uint8(self):
228228
def test_merge_overlapping_debug_handles_basic(self):
229229
big_tensor = torch.rand(100, 100)
230230
intermediate_outputs = {
231-
(1, 2, 3): "val1",
232-
(2, 3, 4, 5): "val2",
233-
(6, 7, 8): "val3",
234-
(10, 11): "val4",
235-
(11, 12): big_tensor,
231+
(1, 2, 3): (1, "val1"),
232+
(2, 3, 4, 5): (2, "val2"),
233+
(6, 7, 8): (3, "val3"),
234+
(10, 11): (4, "val4"),
235+
(11, 12): (5, big_tensor),
236236
}
237237
# basic merge behavior
238-
intermediate_outputs = merge_overlapping_debug_handles(intermediate_outputs)
238+
intermediate_outputs = merge_runtime_overlapping_debug_handles(
239+
intermediate_outputs
240+
)
239241
expected_intermediate_outputs = {
240-
(1, 2, 3, 4, 5): "val2",
241-
(6, 7, 8): "val3",
242-
(10, 11, 12): big_tensor,
242+
(1, 2, 3, 4, 5): (2, "val2"),
243+
(6, 7, 8): (3, "val3"),
244+
(10, 11, 12): (5, big_tensor),
243245
}
244-
245246
self.assertEqual(intermediate_outputs, expected_intermediate_outputs)
246-
self.assertIs(expected_intermediate_outputs[(10, 11, 12)], big_tensor)
247+
self.assertIs(expected_intermediate_outputs[(10, 11, 12)][1], big_tensor)
247248

248249
def test_merge_overlapping_debug_handles_non_continuous(self):
249-
tensor1 = (torch.randn(3, 4),)
250-
tensor2 = (torch.randn(2, 3),)
251-
tensor3 = (torch.randn(4, 5),)
252-
tensor4 = (torch.randn(6, 7),)
253-
tensor5 = (torch.randn(8, 9),)
250+
tensor1 = torch.randn(3, 4)
251+
tensor2 = torch.randn(2, 3)
252+
tensor3 = torch.randn(4, 5)
253+
tensor4 = torch.randn(6, 7)
254+
tensor5 = torch.randn(8, 9)
254255
intermediate_outputs = {
255-
(1, 10): tensor1,
256-
(2, 5): tensor2,
257-
(1, 7, 9): tensor3,
258-
(11, 13): tensor4,
259-
(11, 15): tensor5,
256+
(1, 10): (1, tensor1),
257+
(2, 5): (2, tensor2),
258+
(1, 7, 9): (3, tensor3),
259+
(11, 13): (4, tensor4),
260+
(11, 15): (5, tensor5),
260261
}
261-
intermediate_outputs = merge_overlapping_debug_handles(intermediate_outputs)
262+
intermediate_outputs = merge_runtime_overlapping_debug_handles(
263+
intermediate_outputs
264+
)
262265
expected_intermediate_outputs = {
263-
(2, 5): tensor2,
264-
(1, 7, 9, 10): tensor1,
265-
(11, 13, 15): tensor5,
266+
(2, 5): (2, tensor2),
267+
(10, 1, 7, 9): (3, tensor3),
268+
(13, 11, 15): (5, tensor5),
266269
}
267270

268-
self.assertEqual(intermediate_outputs, expected_intermediate_outputs)
271+
for key in expected_intermediate_outputs:
272+
expected_value = expected_intermediate_outputs[key][1]
273+
actual_value = intermediate_outputs[key][1]
274+
self.assertTrue(torch.allclose(expected_value, actual_value))
269275

270276
def test_map_runtime_aot_intermediate_outputs_empty_inputs(self):
271277
# When the inputs are empty, the output should also be empty

install_requirements.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def python_is_compatible():
7171
#
7272
# NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
7373
# by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
74-
NIGHTLY_VERSION = "dev20250706"
74+
NIGHTLY_VERSION = "dev20250625"
7575

7676

7777
def install_requirements(use_pytorch_nightly):
@@ -89,7 +89,7 @@ def install_requirements(use_pytorch_nightly):
8989
# Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
9090
# that we don't need to set any version number there because they have already
9191
# been installed on CI before this step, so pip won't reinstall them
92-
f"torch==2.9.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
92+
f"torch==2.8.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
9393
]
9494

9595
# Install the requirements for core ExecuTorch package.
@@ -151,7 +151,7 @@ def install_optional_example_requirements(use_pytorch_nightly):
151151
print("Installing torch domain libraries")
152152
DOMAIN_LIBRARIES = [
153153
(
154-
f"torchvision==0.24.0.{NIGHTLY_VERSION}"
154+
f"torchvision==0.23.0.{NIGHTLY_VERSION}"
155155
if use_pytorch_nightly
156156
else "torchvision"
157157
),

runtime/core/portable_type/c10/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ would cause all headers in that directory to be includeable with
1212
`runtime/core/portable_type/complex.h`, which would shadow the C99
1313
`complex.h` standard header.
1414

15-
`torch/headeronly` has been added as an extra "even more bottom of
15+
`torch/standalone` has been added as an extra "even more bottom of
1616
stack" directory in PyTorch, so we have to add it to our sync
1717
here. The extra "stutter" c10 directory causing `c10/torch/standalone`
1818
is unfortunately awkward; perhaps we can rename the top-level

runtime/core/portable_type/c10/c10/macros/Export.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#include <c10/macros/cmake_macros.h>
66
#endif // C10_USING_CUSTOM_GENERATED_MACROS
77

8-
#include <torch/headeronly/macros/Export.h>
8+
#include <torch/standalone/macros/Export.h>
99

1010
// This one is being used by libtorch.so
1111
#ifdef CAFFE2_BUILD_MAIN_LIB

runtime/core/portable_type/c10/c10/macros/Macros.h

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -312,21 +312,7 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
312312
#endif
313313

314314
#if defined(USE_ROCM)
315-
// C10_WARP_SIZE is only allowed for device code.
316-
// Host code _must_ use at::cuda::warp_size()
317-
// HIP header used to define warpSize as a constexpr that was either 32 or 64
318-
// depending on the target device, and then always set it to 64 for host code.
319-
// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
320-
// set it to something unreasonable to trigger obvious host code errors.
321-
#if defined(__HIP_DEVICE_COMPILE__)
322-
#if defined(__GFX9__)
323-
static constexpr int C10_WARP_SIZE = 64;
324-
#else // __GFX9__
325-
static constexpr int C10_WARP_SIZE = 32;
326-
#endif // __GFX9__
327-
#else
328-
static constexpr int C10_WARP_SIZE = 1;
329-
#endif // __HIP_DEVICE_COMPILE__
315+
#define C10_WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h)
330316
#else
331317
#define C10_WARP_SIZE 32
332318
#endif

runtime/core/portable_type/c10/c10/targets.bzl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def define_common_targets():
125125
"@EXECUTORCH_CLIENTS",
126126
],
127127
exported_deps = [
128-
"//executorch/runtime/core/portable_type/c10/torch/headeronly:torch_headeronly",
128+
"//executorch/runtime/core/portable_type/c10/torch/standalone:torch_standalone_headers",
129129
] + select({
130130
"DEFAULT": [],
131131
# Half-inl.h depends on vec_half.h from ATen, but only when building for x86.

0 commit comments

Comments
 (0)