Commit 8d78ff2

Merge remote-tracking branch 'origin/main' into aar-out
2 parents 3061930 + 66c135c

62 files changed, +323 -170 lines changed

.github/workflows/_android.yml

Lines changed: 2 additions & 1 deletion
@@ -131,7 +131,8 @@ jobs:
   # https://github.com/ReactiveCircus/android-emulator-runner. The max number
   # of cores we can set is 6, any higher number will be reduced to 6.
   cores: 6
-  ram-size: 12288M
+  ram-size: 16384M
+  heap-size: 12288M
   force-avd-creation: false
   disable-animations: true
   emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -camera-back none

CONTRIBUTING.md

Lines changed: 13 additions & 13 deletions
@@ -5,10 +5,10 @@ it easy to contribute to this project.
 ## Dev Install

 Set up your environment by following the instructions at
-https://pytorch.org/executorch/stable/getting-started-setup.html to clone
+https://pytorch.org/executorch/main/getting-started-setup to clone
 the repo and install the necessary requirements.

-Refer to this [document](https://pytorch.org/executorch/main/using-executorch-building-from-source.html) to build ExecuTorch from source.
+Refer to this [document](docs/source/using-executorch-building-from-source.md) to build ExecuTorch from source.

 ### Dev Setup for Android
 For Android, please refer to the [Android documentation](docs/source/using-executorch-android.md).
@@ -40,8 +40,8 @@ executorch
 ├── <a href="devtools">devtools</a> - Model profiling, debugging, and inspection. Please refer to the <a href="docs/source/devtools-overview.md">tools documentation</a> for more information.
 │ ├── <a href="devtools/bundled_program">bundled_program</a> - a tool for validating ExecuTorch model. See <a href="docs/source/bundled-io.md">doc</a>.
 │ ├── <a href="devtools/etdump">etdump</a> - ETDump - a format for saving profiling and debugging data from runtime. See <a href="docs/source/etdump.md">doc</a>.
-│ ├── <a href="devtools/etrecord">etrecord</a> - ETRecord - AOT debug artifact for ExecuTorch. See <a href="https://pytorch.org/executorch/main/etrecord.html">doc</a>.
-│ ├── <a href="devtools/inspector">inspector</a> - Python API to inspect ETDump and ETRecord. See <a href="https://pytorch.org/executorch/main/model-inspector.html">doc</a>.
+│ ├── <a href="devtools/etrecord">etrecord</a> - ETRecord - AOT debug artifact for ExecuTorch. See <a href="https://pytorch.org/executorch/main/etrecord">doc</a>.
+│ ├── <a href="devtools/inspector">inspector</a> - Python API to inspect ETDump and ETRecord. See <a href="https://pytorch.org/executorch/main/model-inspector">doc</a>.
 │ └── <a href="devtools/visualization">visualization</a> - Visualization tools for representing model structure and performance metrics.
 ├── <a href="docs">docs</a> - Static docs tooling and documentation source files.
 ├── <a href="examples">examples</a> - Examples of various user flows, such as model export, delegates, and runtime execution.
@@ -57,8 +57,8 @@ executorch
 │ ├── <a href="exir/serde">serde</a> - Graph module serialization/deserialization.
 │ ├── <a href="exir/verification">verification</a> - IR verification.
 ├── <a href="extension">extension</a> - Extensions built on top of the runtime.
-│ ├── <a href="extension/android">android</a> - ExecuTorch wrappers for Android apps. Please refer to the <a href="docs/source/using-executorch-android.md">Android documentation</a> and <a href="https://pytorch.org/executorch/main/javadoc/">Javadoc</a> for more information.
-│ ├── <a href="extension/apple">apple</a> - ExecuTorch wrappers for iOS apps. Please refer to the <a href="docs/source/using-executorch-ios.md">iOS documentation</a> and <a href="https://pytorch.org/executorch/stable/apple-runtime.html">how to integrate into Apple platform</a> for more information.
+│ ├── <a href="extension/android">android</a> - ExecuTorch wrappers for Android apps. Please refer to the <a href="docs/source/using-executorch-android.md">Android documentation</a> and <a href="https://pytorch.org/executorch/main/javadoc">Javadoc</a> for more information.
+│ ├── <a href="extension/apple">apple</a> - ExecuTorch wrappers for iOS apps. Please refer to the <a href="docs/source/using-executorch-ios.md">iOS documentation</a> on how to integrate into Apple platform</a> for more information.
 │ ├── <a href="extension/aten_util">aten_util</a> - Converts to and from PyTorch ATen types.
 │ ├── <a href="extension/data_loader">data_loader</a> - 1st party data loader implementations.
 │ ├── <a href="extension/evalue_util">evalue_util</a> - Helpers for working with EValue objects.
@@ -68,10 +68,10 @@ executorch
 │ ├── <a href="extension/memory_allocator">memory_allocator</a> - 1st party memory allocator implementations.
 │ ├── <a href="extension/module">module</a> - A simplified C++ wrapper for the runtime. An abstraction that deserializes and executes an ExecuTorch artifact (.pte file). Refer to the <a href="docs/source/extension-module.md">module documentation</a> for more information.
 │ ├── <a href="extension/parallel">parallel</a> - C++ threadpool integration.
-│ ├── <a href="extension/pybindings">pybindings</a> - Python API for executorch runtime. This is powering up the <a href="https://pytorch.org/executorch/main/runtime-python-api-reference.html">runtime Python API</a> for ExecuTorch.
+│ ├── <a href="extension/pybindings">pybindings</a> - Python API for executorch runtime. This is powering up the <a href="docs/source/runtime-python-api-reference.md">runtime Python API</a> for ExecuTorch.
 │ ├── <a href="extension/pytree">pytree</a> - C++ and Python flattening and unflattening lib for pytrees.
 │ ├── <a href="extension/runner_util">runner_util</a> - Helpers for writing C++ PTE-execution tools.
-│ ├── <a href="extension/tensor">tensor</a> - Tensor maker and <code>TensorPtr</code>, details in <a href="/docs/source/extension-tensor.md">this documentation</a>. For how to use <code>TensorPtr</code> and <code>Module</code>, please refer to the <a href="/docs/source/using-executorch-cpp.md">"Using ExecuTorch with C++"</a> doc.
+│ ├── <a href="extension/tensor">tensor</a> - Tensor maker and <code>TensorPtr</code>, details in <a href="docs/source/extension-tensor.md">this documentation</a>. For how to use <code>TensorPtr</code> and <code>Module</code>, please refer to the <a href="docs/source/using-executorch-cpp.md">"Using ExecuTorch with C++"</a> doc.
 │ ├── <a href="extension/testing_util">testing_util</a> - Helpers for writing C++ tests.
 │ ├── <a href="extension/threadpool">threadpool</a> - Threadpool.
 │ └── <a href="extension/training">training</a> - Experimental libraries for on-device training.
@@ -85,7 +85,7 @@ executorch
 ├── <a href="runtime">runtime</a> - Core C++ runtime. These components are used to execute the ExecuTorch program. Please refer to the <a href="docs/source/runtime-overview.md">runtime documentation</a> for more information.
 │ ├── <a href="runtime/backend">backend</a> - Backend delegate runtime APIs.
 │ ├── <a href="runtime/core">core</a> - Core structures used across all levels of the runtime. Basic components such as <code>Tensor</code>, <code>EValue</code>, <code>Error</code> and <code>Result</code> etc.
-│ ├── <a href="runtime/executor">executor</a> - Model loading, initialization, and execution. Runtime components that execute the ExecuTorch program, such as <code>Program</code>, <code>Method</code>. Refer to the <a href="https://pytorch.org/executorch/main/executorch-runtime-api-reference.html">runtime API documentation</a> for more information.
+│ ├── <a href="runtime/executor">executor</a> - Model loading, initialization, and execution. Runtime components that execute the ExecuTorch program, such as <code>Program</code>, <code>Method</code>. Refer to the <a href="https://pytorch.org/executorch/main/executorch-runtime-api-reference">runtime API documentation</a> for more information.
 │ ├── <a href="runtime/kernel">kernel</a> - Kernel registration and management.
 │ └── <a href="runtime/platform">platform</a> - Layer between architecture specific code and portable C++.
 ├── <a href="schema">schema</a> - ExecuTorch PTE file format flatbuffer schemas.
@@ -102,7 +102,7 @@ executorch
 ## Contributing workflow
 We actively welcome your pull requests (PRs).

-If you're completely new to open-source projects, GitHub, or ExecuTorch, please see our [New Contributor Guide](./docs/source/new-contributor-guide.md) for a step-by-step walkthrough on making your first contribution. Otherwise, read on.
+If you're completely new to open-source projects, GitHub, or ExecuTorch, please see our [New Contributor Guide](docs/source/new-contributor-guide.md) for a step-by-step walkthrough on making your first contribution. Otherwise, read on.

 1. [Claim an issue](#claiming-issues), if present, before starting work. If an
    issue doesn't cover the work you plan to do, consider creating one to provide
@@ -245,7 +245,7 @@ modifications to the Google C++ style guide.

 ### C++ Portability Guidelines

-See also [Portable C++ Programming](/docs/source/portable-cpp-programming.md)
+See also [Portable C++ Programming](docs/source/portable-cpp-programming.md)
 for detailed advice.

 #### C++ language version
@@ -417,9 +417,9 @@ for basics.

 ## For Backend Delegate Authors

-- Use [this](/docs/source/backend-delegates-integration.md) guide when
+- Use [this](docs/source/backend-delegates-integration.md) guide when
   integrating your delegate with ExecuTorch.
-- Refer to [this](/docs/source/backend-delegates-dependencies.md) set of
+- Refer to [this](docs/source/backend-delegates-dependencies.md) set of
   guidelines when including a third-party depenency for your delegate.

 &nbsp;

Package.swift

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 //
 // For details on building frameworks locally or using prebuilt binaries,
 // see the documentation:
-// https://pytorch.org/executorch/main/using-executorch-ios.html
+// https://pytorch.org/executorch/main/using-executorch-ios

 import PackageDescription

README-wheel.md

Lines changed: 9 additions & 20 deletions
@@ -10,32 +10,21 @@ The `executorch` pip package is in beta.

 The prebuilt `executorch.runtime` module included in this package provides a way
 to run ExecuTorch `.pte` files, with some restrictions:
-* Only [core ATen
-  operators](https://pytorch.org/executorch/stable/ir-ops-set-definition.html)
-  are linked into the prebuilt module
-* Only the [XNNPACK backend
-  delegate](https://pytorch.org/executorch/main/native-delegates-executorch-xnnpack-delegate.html)
-  is linked into the prebuilt module.
-* \[macOS only] [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html)
-  and [MPS](https://pytorch.org/executorch/main/build-run-mps.html) backend
-  delegates are also linked into the prebuilt module.
+* Only [core ATen operators](docs/source/ir-ops-set-definition.md) are linked into the prebuilt module
+* Only the [XNNPACK backend delegate](docs/source/backends-xnnpack.md) is linked into the prebuilt module.
+* \[macOS only] [Core ML](docs/source/backends-coreml.md) and [MPS](docs/source/backends-mps.md) backend
+  are also linked into the prebuilt module.

-Please visit the [ExecuTorch website](https://pytorch.org/executorch/) for
+Please visit the [ExecuTorch website](https://pytorch.org/executorch) for
 tutorials and documentation. Here are some starting points:
-* [Getting
-  Started](https://pytorch.org/executorch/stable/getting-started-setup.html)
+* [Getting Started](https://pytorch.org/executorch/main/getting-started-setup)
   * Set up the ExecuTorch environment and run PyTorch models locally.
-* [Working with
-  local LLMs](https://pytorch.org/executorch/stable/llm/getting-started.html)
+* [Working with local LLMs](docs/source/llm/getting-started.md)
   * Learn how to use ExecuTorch to export and accelerate a large-language model
     from scratch.
-* [Exporting to
-  ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial.html)
+* [Exporting to ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial)
   * Learn the fundamentals of exporting a PyTorch `nn.Module` to ExecuTorch, and
     optimizing its performance using quantization and hardware delegation.
-* Running LLaMA on
-  [iOS](https://pytorch.org/executorch/stable/llm/llama-demo-ios.html) and
-  [Android](https://pytorch.org/executorch/stable/llm/llama-demo-android.html)
-  devices.
+* Running LLaMA on [iOS](docs/source/llm/llama-demo-ios) and [Android](docs/source/llm/llama-demo-android) devices.
   * Build and run LLaMA in a demo mobile app, and learn how to integrate models
     with your own apps.
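
As an aside on the `executorch.runtime` module that README-wheel.md describes: a minimal sketch of loading and running a `.pte` file with the prebuilt runtime might look like the following. The exact API surface can vary between releases, and the "model.pte" path and input shape here are hypothetical, so treat this as an illustration of the flow rather than a definitive usage.

import torch
from executorch.runtime import Runtime  # assumed import path for the prebuilt Python runtime

runtime = Runtime.get()
# "model.pte" is a placeholder for a previously exported ExecuTorch program.
program = runtime.load_program("model.pte")
method = program.load_method("forward")
# The example input is hypothetical; it must match the exported model's signature.
outputs = method.execute([torch.randn(1, 3, 224, 224)])
print(outputs)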

README.md

Lines changed: 5 additions & 5 deletions
@@ -1,5 +1,5 @@
 <div align="center">
-  <img src="./docs/source/_static/img/et-logo.png" alt="Logo" width="200">
+  <img src="docs/source/_static/img/et-logo.png" alt="Logo" width="200">
   <h1 align="center">ExecuTorch: A powerful on-device AI Framework</h1>
 </div>

@@ -8,7 +8,7 @@
   <a href="https://github.com/pytorch/executorch/graphs/contributors"><img src="https://img.shields.io/github/contributors/pytorch/executorch?style=for-the-badge&color=blue" alt="Contributors"></a>
   <a href="https://github.com/pytorch/executorch/stargazers"><img src="https://img.shields.io/github/stars/pytorch/executorch?style=for-the-badge&color=blue" alt="Stargazers"></a>
   <a href="https://discord.gg/Dh43CKSAdc"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
-  <a href="https://pytorch.org/executorch/stable/index.html"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
+  <a href="https://pytorch.org/executorch/main/index"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
   <hr>
 </div>

@@ -49,9 +49,9 @@ Key value propositions of ExecuTorch are:
 ## Getting Started
 To get started you can:

-- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index.html) to get things running locally and deploy a model to a device
-- Use this [Colab Notebook](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) to start playing around right away
-- Jump straight into LLM use cases by following specific instructions for [Llama](./examples/models/llama/README.md) and [Llava](./examples/models/llava/README.md)
+- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index) to get things running locally and deploy a model to a device
+- Use this [Colab Notebook](https://pytorch.org/executorch/main/getting-started-setup#quick-setup-colab-jupyter-notebook-prototype) to start playing around right away
+- Jump straight into LLM use cases by following specific instructions for [Llama](examples/models/llama/README.md) and [Llava](examples/models/llava/README.md)

 ## Feedback and Engagement

backends/apple/mps/setup.md

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ In order to be able to successfully build and run a model using the MPS backend

 ## Setting up Developer Environment

-***Step 1.*** Please finish tutorial [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup).
+***Step 1.*** Please finish tutorial [Setting up ExecuTorch](https://pytorch.org/executorch/main/getting-started-setup).

 ***Step 2.*** Install dependencies needed to lower MPS delegate:

backends/cadence/aot/replace_ops.py

Lines changed: 97 additions & 0 deletions
@@ -2110,6 +2110,102 @@ def call_operator(
         return super().call_operator(op, args, kwargs, meta)


+@register_cadence_pass(CadencePassAttribute(opt_level=2))
+class ReplaceGeluWithApproximateGeluPass(ExportPass):
+    """
+    Replace the gelu op with an approximate gelu op. The approximate gelu op
+    is more efficient on DSP backends.
+    """
+
+    def call_operator(
+        self,
+        op,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        if op not in {
+            exir_ops.edge.aten.gelu.default,
+        }:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # compute the approximate gelu (0.7978845608028654 is sqrt(2 / pi))
+        # as 0.5 * x * (1 + torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3)))
+
+        # Get 0.5 * x
+        half = super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (args[0], 0.5),
+            {},
+            meta,
+        )
+
+        scaled = super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (args[0], 0.044715),
+            {},
+            meta,
+        )
+
+        # Get x^2 (note that we use mul.Tensor twice instead of pow.Tensor because
+        # it is much more efficient on DSP backends)
+        scaled_square = super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (scaled, args[0]),
+            {},
+            meta,
+        )
+
+        # Get x^3
+        scaled_cubed = super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (scaled_square, args[0]),
+            {},
+            meta,
+        )
+
+        # Get x + 0.044715 * x^3
+        inner_sum = super().call_operator(
+            exir_ops.edge.aten.add.Tensor,
+            (scaled_cubed, args[0]),
+            {},
+            meta,
+        )
+
+        # Get 0.7978845608028654 * ( x + 0.044715 * x^3)
+        scaled_sum = super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (inner_sum, 0.7978845608028654),
+            {},
+            meta,
+        )
+
+        # Get torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3))
+        tanh = super().call_operator(
+            exir_ops.edge.aten.tanh.default,
+            (scaled_sum,),
+            {},
+            meta,
+        )
+
+        # Get 1 + torch.tanh(0.79788456 * ( x + 0.044715 * x^3))
+        # TODO(): Check why this is not working properly with integer values (e.g. 1 instead of 1.)
+        outer_sum = super().call_operator(
+            exir_ops.edge.aten.add.Tensor,
+            (tanh, 1.0),
+            {},
+            meta,
+        )
+
+        # Retunr the final result
+        return super().call_operator(
+            exir_ops.edge.aten.mul.Tensor,
+            (half, outer_sum),
+            {},
+            meta,
+        )
+
+
 # This class encapsulates all the functions that replace/switch one op in the
 # graph with another.
 class CadenceReplaceOpsInGraph:
@@ -2149,4 +2245,5 @@ class CadenceReplaceOpsInGraph:
         ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
         ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass,
         ReplaceWhereWithFullArgsWithWhereScalar,
+        # ReplaceGeluWithApproximateGeluPass,
     ]
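
A note on the formula implemented by the new ReplaceGeluWithApproximateGeluPass above: the pass builds the tanh-based approximation gelu(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) out of elementwise mul, add, and tanh ops. The standalone snippet below is only a quick sanity check of that math against PyTorch's own gelu; it is not part of the commit.

import torch

x = torch.randn(1000)

# Tanh-based approximation used by the pass (0.7978845608028654 ≈ sqrt(2 / pi)).
approx = 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * (x + 0.044715 * x**3)))

exact = torch.nn.functional.gelu(x)                          # exact (erf-based) gelu
tanh_ref = torch.nn.functional.gelu(x, approximate="tanh")   # PyTorch's tanh variant

print(torch.max(torch.abs(approx - exact)))     # small but nonzero approximation error
print(torch.max(torch.abs(approx - tanh_ref)))  # ~0: matches PyTorch's tanh approximation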
