Skip to content

Commit 3a0b7cb

Browse files
authored
Merge branch 'main' into export-D84284541
2 parents d142cfd + 8d51b0f commit 3a0b7cb

15 files changed

+55
-16
lines changed

backends/vulkan/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ file(GLOB_RECURSE vulkan_runtime_utils_cpp ${RUNTIME_PATH}/utils/*.cpp)
111111

112112
# vulkan_backend
113113

114+
# Try to find boost to log stack traces when throwing exceptions
115+
find_package(Boost 1.89 COMPONENTS stacktrace_basic stacktrace_addr2line)
116+
114117
file(GLOB vulkan_backend_cpp ${RUNTIME_PATH}/*.cpp)
115118
list(APPEND vulkan_backend_cpp ${vulkan_graph_cpp})
116119
list(APPEND vulkan_backend_cpp ${vulkan_standard_shaders_cpp})
@@ -121,6 +124,14 @@ target_include_directories(
121124
vulkan_backend PRIVATE ${SCHEMA_INCLUDE_DIR} ${COMMON_INCLUDES}
122125
)
123126
target_link_libraries(vulkan_backend PRIVATE vulkan_schema executorch_core)
127+
# Optionally link boost for stacktraces if boost is available
128+
if(DEFINED Boost_STACKTRACE_BASIC_LIBRARY)
129+
target_link_libraries(
130+
vulkan_backend PRIVATE ${Boost_STACKTRACE_BASIC_LIBRARY}
131+
${Boost_STACKTRACE_ADDR2LINE_LIBRARY}
132+
)
133+
list(APPEND VULKAN_CXX_FLAGS "-DETVK_BOOST_STACKTRACE_AVAILABLE")
134+
endif()
124135
target_compile_options(vulkan_backend PRIVATE ${VULKAN_CXX_FLAGS})
125136
# Link this library with --whole-archive due to dynamic backend registration
126137
executorch_target_link_options_shared_lib(vulkan_backend)

backends/vulkan/runtime/vk_api/Exception.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@
1010

1111
#include <sstream>
1212

13+
#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE
14+
#ifndef _GNU_SOURCE
15+
#define _GNU_SOURCE
16+
#endif // _GNU_SOURCE
17+
#include <boost/stacktrace.hpp>
18+
#endif // ETVK_BOOST_STACKTRACE_AVAILABLE
19+
1320
namespace vkcompute {
1421
namespace vkapi {
1522

@@ -65,6 +72,11 @@ Error::Error(SourceLocation source_location, std::string msg)
6572
std::ostringstream oss;
6673
oss << "Exception raised from " << source_location_ << ": ";
6774
oss << msg_;
75+
#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE
76+
oss << "\n";
77+
oss << "Stack trace:\n";
78+
oss << boost::stacktrace::stacktrace();
79+
#endif // ETVK_BOOST_STACKTRACE_AVAILABLE
6880
what_ = oss.str();
6981
}
7082

@@ -74,6 +86,11 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg)
7486
oss << "Exception raised from " << source_location_ << ": ";
7587
oss << "(" << cond << ") is false! ";
7688
oss << msg_;
89+
#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE
90+
oss << "\n";
91+
oss << "Stack trace:\n";
92+
oss << boost::stacktrace::stacktrace();
93+
#endif // ETVK_BOOST_STACKTRACE_AVAILABLE
7794
what_ = oss.str();
7895
}
7996

backends/vulkan/targets.bzl

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode):
1919
default_flags = []
2020
android_flags = []
2121

22+
debug_mode = read_config("etvk", "debug", "0") == "1"
23+
2224
if not no_volk:
2325
for flags in [default_flags, android_flags]:
2426
flags.append("-DUSE_VULKAN_WRAPPER")
@@ -32,6 +34,10 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode):
3234
if link_moltenvk:
3335
mac_flags = []
3436

37+
if debug_mode:
38+
mac_flags.append("-DETVK_BOOST_STACKTRACE_AVAILABLE")
39+
default_flags.append("-DETVK_BOOST_STACKTRACE_AVAILABLE")
40+
3541
VK_API_PREPROCESSOR_FLAGS += select({
3642
"DEFAULT": default_flags,
3743
"ovr_config//os:android": android_flags,
@@ -59,7 +65,6 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode):
5965
if etvk_default_cache_path != "":
6066
VK_API_PREPROCESSOR_FLAGS += ["-DETVK_DEFAULT_CACHE_PATH={}".format(etvk_default_cache_path)]
6167

62-
debug_mode = read_config("etvk", "debug", "0") == "1"
6368
if debug_mode:
6469
VK_API_PREPROCESSOR_FLAGS += ["-DVULKAN_DEBUG"]
6570

@@ -136,6 +141,8 @@ def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False, no_volk = Fal
136141
)
137142

138143
def define_common_targets(is_fbcode = False):
144+
debug_mode = read_config("etvk", "debug", "0") == "1"
145+
139146
runtime.python_library(
140147
name = "gen_vulkan_spv_lib",
141148
srcs = [
@@ -200,6 +207,10 @@ def define_common_targets(is_fbcode = False):
200207
"//third-party/khronos:moltenVK_static"
201208
]
202209

210+
if debug_mode:
211+
mac_deps.append("fbsource//third-party/boost:boost")
212+
default_deps.append("fbsource//third-party/boost:boost")
213+
203214
VK_API_DEPS += select({
204215
"DEFAULT": default_deps,
205216
"ovr_config//os:android": android_deps,

docs/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ To build the documentation locally:
4343
git clone -b viable/strict https://github.com/pytorch/executorch.git && cd executorch
4444
```
4545

46-
1. If you don't have it already, start either a Python virtual envitonment:
46+
1. If you don't have it already, start either a Python virtual environment:
4747

4848
```bash
4949
python3 -m venv .venv && source .venv/bin/activate && pip install --upgrade pip
@@ -111,7 +111,7 @@ You can use the variables in both regular text and code blocks.
111111
## Including READMEs to the Documentation Build
112112

113113
You might want to include some of the `README.md` files from various directories
114-
in this repositories in your documentation build. To do that, create an `.md`
114+
in this repository in your documentation build. To do that, create an `.md`
115115
file and use the `{include}` directive to insert your `.md` files. Example:
116116

117117
````
@@ -177,7 +177,7 @@ file:
177177
````
178178

179179
In the `index.md` file, I would add `tutorials/selective-build-tutorial` in
180-
both the `toctree` and the `cusotmcarditem` sections.
180+
both the `toctree` and the `customcarditem` sections.
181181

182182
# Auto-generated API documentation
183183

docs/source/backends-coreml.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ The Core ML partitioner API allows for configuration of the model delegation to
6161
- `skip_ops_for_coreml_delegation`: Allows you to skip ops for delegation by Core ML. By default, all ops that Core ML supports will be delegated. See [here](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/apple/coreml/test/test_coreml_partitioner.py#L42) for an example of skipping an op for delegation.
6262
- `compile_specs`: A list of `CompileSpec`s for the Core ML backend. These control low-level details of Core ML delegation, such as the compute unit (CPU, GPU, ANE), the iOS deployment target, and the compute precision (FP16, FP32). These are discussed more below.
6363
- `take_over_mutable_buffer`: A boolean that indicates whether PyTorch mutable buffers in stateful models should be converted to [Core ML `MLState`](https://developer.apple.com/documentation/coreml/mlstate). If set to `False`, mutable buffers in the PyTorch graph are converted to graph inputs and outputs to the Core ML lowered module under the hood. Generally, setting `take_over_mutable_buffer` to true will result in better performance, but using `MLState` requires iOS >= 18.0, macOS >= 15.0, and Xcode >= 16.0.
64-
- `take_over_constant_data`: A boolean that indicates whether PyTorch constant data like model weights should be consumed by the Core ML delegate. If set to False, constant data is passed to the Core ML delegate as inputs. By deafault, take_over_constant_data=True.
64+
- `take_over_constant_data`: A boolean that indicates whether PyTorch constant data like model weights should be consumed by the Core ML delegate. If set to False, constant data is passed to the Core ML delegate as inputs. By default, take_over_constant_data=True.
6565
- `lower_full_graph`: A boolean that indicates whether the entire graph must be lowered to Core ML. If set to True and Core ML does not support an op, an error is raised during lowering. If set to False and Core ML does not support an op, the op is executed on the CPU by ExecuTorch. Although setting `lower_full_graph`=False can allow a model to lower where it would otherwise fail, it can introduce performance overhead in the model when there are unsupported ops. You will see warnings about unsupported ops during lowering if there are any. By default, `lower_full_graph`=False.
6666

6767

docs/source/backends-overview.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ Backends are the bridge between your exported model and the hardware it runs on.
3131
| [OpenVINO](build-run-openvino) | Embedded | CPU/GPU/NPU | Intel SoCs |
3232
| [NXP](backends-nxp) | Embedded | NPU | NXP SoCs |
3333
| [Cadence](backends-cadence) | Embedded | DSP | DSP-optimized workloads |
34-
| [Samsung Exynos](backends-samsung-exynos)| Android | NPU | Samsung Socs |
34+
| [Samsung Exynos](backends-samsung-exynos)| Android | NPU | Samsung SoCs |
3535

3636
**Tip:** For best performance, export a `.pte` file for each backend you plan to support.
3737

docs/source/backends-xnnpack.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ To perform 8-bit quantization with the PT2E flow, perform the following steps pr
8282
1) Create an instance of the `XnnpackQuantizer` class. Set quantization parameters.
8383
2) Use `torch.export.export` to prepare for quantization.
8484
3) Call `prepare_pt2e` to prepare the model for quantization.
85-
4) For static quantization, run the prepared model with representative samples to calibrate the quantizated tensor activation ranges.
85+
4) For static quantization, run the prepared model with representative samples to calibrate the quantized tensor activation ranges.
8686
5) Call `convert_pt2e` to quantize the model.
8787
6) Export and lower the model using the standard flow.
8888

docs/source/devtools-overview.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,6 @@ More details are available in the [ETDump documentation](etdump.md) on how to ge
4141

4242

4343
### Inspector APIs
44-
The Inspector Python APIs are the main user enrty point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API.
44+
The Inspector Python APIs are the main user entry point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API.
4545

4646
More details are available in the [Inspector API documentation](model-inspector.rst) on how to use the Inspector APIs.

docs/source/getting-started-architecture.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,6 @@ _Executor_ is the entry point to load the program and execute it. The execution
8989

9090
## Developer Tools
9191

92-
It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases.
92+
It should be efficient for users to go from research to production using the flow above. Productivity is especially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases.
9393

9494
During the program preparation and execution, users can use the ExecuTorch Developer Tools to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments.

docs/source/getting-started.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ input_tensor: torch.Tensor = torch.randn(1, 3, 224, 224)
8989
program = runtime.load_program("model.pte")
9090
method = program.load_method("forward")
9191
output: List[torch.Tensor] = method.execute([input_tensor])
92-
print("Run succesfully via executorch")
92+
print("Run successfully via executorch")
9393

9494
from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
9595
import torchvision.models as models

0 commit comments

Comments
 (0)