
Commit 123bb35

Merge branch 'main' into export-D80308822
2 parents 77cd291 + 3bb42ec commit 123bb35

68 files changed: +1993 additions, -422 deletions


.github/workflows/stale

Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
# The behavior is:
# - If a PR is not labeled stale, after 60 days of inactivity label the PR as stale and comment about it.
# - If a PR is labeled stale, after 30 days of inactivity close the PR.
# - `high priority` and `no-stale` PRs are exempt.

name: Close stale pull requests

on:
  schedule:
    # Run daily at 00:30 UTC.
    - cron: '30 0 * * *'
  workflow_dispatch:

jobs:
  stale:
    if: ${{ github.repository == 'pytorch/executorch' }}
    runs-on: linux.large
    permissions:
      contents: read
      pull-requests: write

    steps:
      - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
            // Do some dumb retries on requests.
            const retries = 7;
            const baseBackoff = 100;
            const sleep = timeout => new Promise(resolve => setTimeout(resolve, timeout));
            github.hook.wrap('request', async (request, options) => {
              for (let attempt = 1; attempt <= retries; attempt++) {
                try {
                  return await request(options);
                } catch (err) {
                  if (attempt < retries) {
                    core.warning(`Request getting retried. Attempt: ${attempt}`);
                    await sleep(baseBackoff * Math.pow(2, attempt));
                    continue;
                  }
                  throw err;
                }
              }
            });

            const MAX_API_REQUESTS = 100;

            // If a PR is not labeled stale, label it stale after no updates for 60 days.
            const STALE_LABEL_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 60;
            // For PRs already labeled stale, close after no updates for 30 days.
            const STALE_CLOSE_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 30;

            const STALE_MESSAGE =
              "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as `Stale`. <br>" +
              "Feel free to remove the `Stale` label if you feel this was a mistake. <br>" +
              "If you are unable to remove the `Stale` label please contact a maintainer in order to do so. <br>" +
              "If you want the bot to never mark this PR stale again, add the `no-stale` label.<br>" +
              "`Stale` pull requests will automatically be closed after 30 days of inactivity.<br>";

            let numAPIRequests = 0;
            let numProcessed = 0;

            async function processPull(pull) {
              core.info(`[${pull.number}] URL: ${pull.html_url}`);
              numProcessed += 1;
              const labels = pull.labels.map((label) => label.name);

              // Skip if certain labels are present.
              if (labels.includes("no-stale") || labels.includes("high priority")) {
                core.info(`[${pull.number}] Skipping because PR has an exempting label.`);
                return false;
              }

              // Check if the PR is stale, according to our configured thresholds.
              let staleThresholdMillis;
              if (labels.includes("Stale")) {
                core.info(`[${pull.number}] PR is labeled stale, checking whether we should close it.`);
                staleThresholdMillis = STALE_CLOSE_THRESHOLD_MS;
              } else {
                core.info(`[${pull.number}] Checking whether to label PR as stale.`);
                staleThresholdMillis = STALE_LABEL_THRESHOLD_MS;
              }

              const millisSinceLastUpdated =
                new Date().getTime() - new Date(pull.updated_at).getTime();

              if (millisSinceLastUpdated < staleThresholdMillis) {
                core.info(`[${pull.number}] Skipping because PR was updated recently`);
                return false;
              }

              // At this point, we know we should do something.
              // For PRs already labeled stale, close them.
              if (labels.includes("Stale")) {
                core.info(`[${pull.number}] Closing PR.`);
                numAPIRequests += 1;
                //await github.rest.issues.update({
                //owner: "pytorch",
                //repo: "executorch",
                //issue_number: pull.number,
                //state: "closed",
                //});
              } else {
                // For PRs not labeled stale, label them stale.
                core.info(`[${pull.number}] Labeling PR as stale.`);

                numAPIRequests += 1;
                //await github.rest.issues.createComment({
                //owner: "pytorch",
                //repo: "executorch",
                //issue_number: pull.number,
                //body: STALE_MESSAGE,
                //});

                numAPIRequests += 1;
                //await github.rest.issues.addLabels({
                //owner: "pytorch",
                //repo: "executorch",
                //issue_number: pull.number,
                //labels: ["Stale"],
                //});
              }
            }

            for await (const response of github.paginate.iterator(
              github.rest.pulls.list,
              {
                owner: "pytorch",
                repo: "executorch",
                state: "open",
                sort: "created",
                direction: "asc",
                per_page: 100,
              }
            )) {
              numAPIRequests += 1;
              const pulls = response.data;
              // Awaiting in a loop is intentional here. We want to serialize execution so
              // that log groups are printed correctly.
              for (const pull of pulls) {
                if (numAPIRequests > MAX_API_REQUESTS) {
                  core.warning("Max API requests exceeded, exiting.");
                  process.exit(0);
                }
                await core.group(`Processing PR #${pull.number}`, async () => {
                  await processPull(pull);
                });
              }
            }
            core.info(`Processed ${numProcessed} PRs total.`);
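
For orientation, here is a small Python sketch (not part of this commit) that simply evaluates the retry backoff schedule and staleness thresholds the script above uses; the constants mirror the workflow (7 attempts, 100 ms base backoff, 60-day label threshold, 30-day close threshold).

    # Standalone sketch of the backoff schedule and staleness thresholds used by
    # the workflow script above. Illustration only; the real logic runs inside
    # the actions/github-script step.
    RETRIES = 7
    BASE_BACKOFF_MS = 100

    def backoff_delays_ms():
        # The script sleeps base * 2**attempt before retrying attempts 1..RETRIES-1.
        return [BASE_BACKOFF_MS * 2 ** attempt for attempt in range(1, RETRIES)]

    STALE_LABEL_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 60   # 60 days
    STALE_CLOSE_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 30   # 30 days

    if __name__ == "__main__":
        print(backoff_delays_ms())        # [200, 400, 800, 1600, 3200, 6400]
        print(STALE_LABEL_THRESHOLD_MS)   # 5184000000
        print(STALE_CLOSE_THRESHOLD_MS)   # 2592000000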

backends/cadence/aot/functions.yaml

Lines changed: 15 additions & 0 deletions
@@ -249,6 +249,21 @@
     - arg_meta: null
       kernel_name: impl::reference::quantized_relu_asym8u_asym8u_per_tensor_out
 
+- func: cadence::quantized_add.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_add_per_tensor_out
+
+- func: cadence::quantized_add_asym8sxasym8s_asym8s.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_add_asym8sxasym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_add_asym8uxasym8u_asym8u.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_add_asym8uxasym8u_asym8u_per_tensor_out
+
 - func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
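
The three new per-tensor quantized_add entries bind reference kernels for the generic, int8 (asym8s), and uint8 (asym8u) cases. As a rough mental model only, the sketch below assumes the usual dequantize-add-requantize semantics of an affine-quantized add; it is an illustration under that assumption, not taken from the Cadence kernel sources in this commit.

    import torch

    def quantized_add_per_tensor_reference(
        X: torch.Tensor, X_scale: float, X_zero_point: int,
        Y: torch.Tensor, Y_scale: float, Y_zero_point: int,
        out_scale: float, out_zero_point: int,
    ) -> torch.Tensor:
        # Hypothetical reference: dequantize both inputs, add in float, then
        # requantize with the output scale/zero-point and clamp to the input
        # dtype's range. The actual kernels may use fixed-point arithmetic.
        x_f = (X.to(torch.float32) - X_zero_point) * X_scale
        y_f = (Y.to(torch.float32) - Y_zero_point) * Y_scale
        q = torch.round((x_f + y_f) / out_scale) + out_zero_point
        info = torch.iinfo(X.dtype)
        return q.clamp(info.min, info.max).to(X.dtype)

    # Example: two int8 tensors quantized with scale 1.0 and zero-point 0.
    x = torch.tensor([10, 20], dtype=torch.int8)
    y = torch.tensor([3, -4], dtype=torch.int8)
    print(quantized_add_per_tensor_reference(x, 1.0, 0, y, 1.0, 0, 1.0, 0))
    # tensor([13, 16], dtype=torch.int8)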

backends/cadence/aot/functions_hifi.yaml

Lines changed: 10 additions & 0 deletions
@@ -404,6 +404,16 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_relu_asym8u_asym8u_per_tensor_out
 
+- func: cadence::quantized_add_asym8sxasym8s_asym8s.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_add_asym8sxasym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_add_asym8uxasym8u_asym8u.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_add_asym8uxasym8u_asym8u_per_tensor_out
+
 - func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null

backends/cadence/aot/memory_planning.py

Lines changed: 8 additions & 5 deletions
@@ -116,6 +116,9 @@ def plan_spec(
         Greedily place the spec in the first memory that can fit it.
         """
         for spec.mem_id in range(1, self.get_num_memories()):
+            if placement_constraints.is_mem_id_in_blocklist(spec, spec.mem_id):
+                # Skip placement for blocked memory id.
+                continue
             prev_offset, smallest_gap = 0, float("inf")
             for allocated_spec in state.allocated_buffers[spec.mem_id]:
                 if not Verifier.lifetime_overlap(spec, allocated_spec):
@@ -141,11 +144,11 @@ def plan_spec(
                 )
             if spec.mem_offset is None:
                 spec.mem_offset = prev_offset
-                if not self.is_valid_placement(spec, placement_constraints):
-                    spec.mem_offset = None
-                    continue
-                else:
-                    spec.mem_offset = prev_offset
+
+            if not self.is_valid_placement(spec, placement_constraints):
+                # Skip placement for invalid memory id.
+                spec.mem_offset = None
+                continue
 
             state.place_spec(spec)
             # A data structure used for maintaining the tensor order
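
The added check skips any memory id that the placement constraints blocklist before attempting the greedy fit. Below is a simplified, self-contained sketch of that control flow with hypothetical names; the real planner places TensorSpec objects and does gap-aware placement, which this omits.

    # Simplified sketch of greedy first-fit placement with a blocklist check,
    # mirroring the control flow added above. All names here are hypothetical.
    from typing import Dict, List, Optional, Set, Tuple

    def place_greedy(
        size: int,
        num_memories: int,
        blocklist: Set[int],                          # mem_ids this buffer may not use
        allocated: Dict[int, List[Tuple[int, int]]],  # mem_id -> [(offset, size), ...]
        capacity: Dict[int, int],                     # mem_id -> bytes available
    ) -> Optional[Tuple[int, int]]:
        for mem_id in range(1, num_memories):
            if mem_id in blocklist:
                # Skip placement for blocked memory id.
                continue
            # First fit: place right after the last byte already allocated here.
            offset = max((off + sz for off, sz in allocated.get(mem_id, [])), default=0)
            if offset + size <= capacity.get(mem_id, 0):
                return mem_id, offset
        return None  # no valid placement found

    # Memory 1 is blocked, so the 16-byte buffer lands in memory 2 after the
    # existing 32-byte allocation.
    print(place_greedy(16, 3, {1}, {2: [(0, 32)]}, {1: 64, 2: 64}))  # (2, 32)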

backends/cadence/aot/memory_planning_algo.py

Lines changed: 2 additions & 2 deletions
@@ -204,7 +204,7 @@ def _place_memory_id_pinned_specs(
             for spec, c in spec_with_abs_constraint.items()
             if c is not None and c.pinned_memory_id == mem_id and c.offset is None
         }
-        logging.error(f"Placing specs {mem_id_pinned_specs} for {mem_id=}")
+        logging.debug(f"Placing specs {mem_id_pinned_specs} for {mem_id=}")
 
         with self.block_memories_except(mem_id):
             self.plan(
@@ -220,7 +220,7 @@ def _place_memory_id_pinned_specs(
             if constraint is None:
                 continue
 
-            logging.error(f"Placing spec {spec} with {constraint}")
+            logging.debug(f"Placing spec {spec} with {constraint}")
 
             if not state.is_placed(spec):
                 raise MemoryError(

backends/cadence/aot/ops_registrations.py

Lines changed: 46 additions & 0 deletions
@@ -325,6 +325,22 @@
     "quantized_add.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, "
     "int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)"
 )
+lib.define(
+    "quantized_add_asym8sxasym8s_asym8s.per_tensor(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, "
+    "int Y_zero_point, float out_scale, int out_zero_point) -> Tensor"
+)
+lib.define(
+    "quantized_add_asym8sxasym8s_asym8s.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, "
+    "int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)"
+)
+lib.define(
+    "quantized_add_asym8uxasym8u_asym8u.per_tensor(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, "
+    "int Y_zero_point, float out_scale, int out_zero_point) -> Tensor"
+)
+lib.define(
+    "quantized_add_asym8uxasym8u_asym8u.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, "
+    "int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)"
+)
 lib.define(
     "quantized_mul.out(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, "
     "Tensor Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)"
@@ -503,6 +519,36 @@ def quantized_add_per_tensor_meta(
     return X.new_empty(out_size, dtype=X.dtype)
 
 
+@register_fake("cadence::quantized_add_asym8sxasym8s_asym8s.per_tensor")
+def quantized_add_asym8sxasym8s_asym8s_per_tensor_meta(
+    X: torch.Tensor,
+    X_scale: float,
+    X_zero_point: int,
+    Y: torch.Tensor,
+    Y_scale: float,
+    Y_zero_point: int,
+    out_scale: float,
+    out_zero_point: int,
+) -> torch.Tensor:
+    out_size = torch.broadcast_shapes(X.size(), Y.size())
+    return X.new_empty(out_size, dtype=X.dtype)
+
+
+@register_fake("cadence::quantized_add_asym8uxasym8u_asym8u.per_tensor")
+def quantized_add_asym8uxasym8u_asym8u_per_tensor_meta(
+    X: torch.Tensor,
+    X_scale: float,
+    X_zero_point: int,
+    Y: torch.Tensor,
+    Y_scale: float,
+    Y_zero_point: int,
+    out_scale: float,
+    out_zero_point: int,
+) -> torch.Tensor:
+    out_size = torch.broadcast_shapes(X.size(), Y.size())
+    return X.new_empty(out_size, dtype=X.dtype)
+
+
 @register_fake("cadence::quantized_linear")
 def quantized_linear_meta(
     src: torch.Tensor,
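
The new fake (meta) kernels only compute output metadata: the shape is the broadcast of the two input shapes and the dtype follows X. A quick standalone check of that shape rule, using the same torch.broadcast_shapes and new_empty calls as the meta functions above:

    import torch

    # torch.broadcast_shapes applies the same broadcasting rule the new
    # register_fake meta functions use to size their outputs.
    print(torch.broadcast_shapes((2, 3), (2, 3)))  # torch.Size([2, 3])
    print(torch.broadcast_shapes((2, 1), (1, 3)))  # torch.Size([2, 3])

    # As in the meta functions, the output tensor is created with the broadcast
    # shape and X's dtype.
    X = torch.zeros((2, 1), dtype=torch.int8)
    Y = torch.zeros((1, 3), dtype=torch.int8)
    out = X.new_empty(torch.broadcast_shapes(X.size(), Y.size()), dtype=X.dtype)
    print(out.shape, out.dtype)  # torch.Size([2, 3]) torch.int8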

backends/cadence/aot/tests/test_memory_passes.py

Lines changed: 1 addition & 1 deletion
@@ -1044,7 +1044,7 @@ class DummyMemIdBlockConstraintGen(PassBase):
     mul: blocks 1, 3
     """
 
-    def __init__(self, memory_constraints: MemoryConfig):
+    def __init__(self, memory_constraints: MemConstraints):
         self.memory_constraints = memory_constraints
 
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:

backends/cadence/aot/tests/test_type_dispatch_passes.py

Lines changed: 50 additions & 0 deletions
@@ -445,3 +445,53 @@ def test_uint8_dispatch_quantized_conv_nhwc_dilated(self) -> None:
             ),
             1,
         )
+
+    def test_int8_dispatch_quantized_add(self) -> None:
+        """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_add"""
+        x = torch.randint(-128, 127, (2, 3), dtype=torch.int8)
+        y = torch.randint(-128, 127, (2, 3), dtype=torch.int8)
+        gm = single_op_builder(
+            placeholders=(x, y),
+            op=exir_ops.edge.cadence.quantized_add.per_tensor,
+            args=(x, 1.0, 0, y, 1.0, 0, 1.0, 0),
+        )
+        p = CompileTimeTypeDispatchPass()
+        gm = cast(PassResult, p(gm)).graph_module
+        # Original op should be replaced
+        self.assertEqual(
+            count_node(gm, exir_ops.edge.cadence.quantized_add.per_tensor),
+            0,
+        )
+        # Should be replaced with int8 specific variant
+        self.assertEqual(
+            count_node(
+                gm,
+                exir_ops.edge.cadence.quantized_add_asym8sxasym8s_asym8s.per_tensor,
+            ),
+            1,
+        )
+
+    def test_uint8_dispatch_quantized_add(self) -> None:
+        """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_add"""
+        x = torch.randint(0, 255, (2, 3), dtype=torch.uint8)
+        y = torch.randint(0, 255, (2, 3), dtype=torch.uint8)
+        gm = single_op_builder(
+            placeholders=(x, y),
+            op=exir_ops.edge.cadence.quantized_add.per_tensor,
+            args=(x, 1.0, 0, y, 1.0, 0, 1.0, 0),
+        )
+        p = CompileTimeTypeDispatchPass()
+        gm = cast(PassResult, p(gm)).graph_module
+        # Original op should be replaced
+        self.assertEqual(
+            count_node(gm, exir_ops.edge.cadence.quantized_add.per_tensor),
+            0,
+        )
+        # Should be replaced with uint8 specific variant
+        self.assertEqual(
+            count_node(
+                gm,
+                exir_ops.edge.cadence.quantized_add_asym8uxasym8u_asym8u.per_tensor,
+            ),
+            1,
+        )

backends/cadence/aot/type_dispatch.py

Lines changed: 8 additions & 0 deletions
@@ -85,6 +85,14 @@ class CompileTimeTypeDispatchPass(ExportPass):
                 (torch.uint8,): "asym8u_asym8u",
             },
         ),
+        exir_ops.edge.cadence.quantized_add.per_tensor: OpConfig(
+            "quantized_add",
+            type_dispatch_suffixes={
+                (torch.int8, torch.int8): "asym8sxasym8s_asym8s",
+                (torch.uint8, torch.uint8): "asym8uxasym8u_asym8u",
+            },
+            weight_arg_idx=3,
+        ),
     }
 
     def call_operator(
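
The new OpConfig keys the dispatch on the dtypes of both tensor inputs (weight_arg_idx=3 appears to point at Y, the second tensor argument in the schema). A minimal sketch of how such a suffix table resolves to a typed variant name; the helper below is hypothetical and only illustrates the mapping, not the pass itself.

    import torch

    # Hypothetical sketch of the dtype-keyed lookup behind CompileTimeTypeDispatchPass
    # for quantized_add: the pair of input dtypes selects the typed variant's suffix.
    SUFFIXES = {
        (torch.int8, torch.int8): "asym8sxasym8s_asym8s",
        (torch.uint8, torch.uint8): "asym8uxasym8u_asym8u",
    }

    def resolve_quantized_add_variant(x_dtype: torch.dtype, y_dtype: torch.dtype) -> str:
        suffix = SUFFIXES[(x_dtype, y_dtype)]
        return f"cadence::quantized_add_{suffix}.per_tensor"

    print(resolve_quantized_add_variant(torch.int8, torch.int8))
    # cadence::quantized_add_asym8sxasym8s_asym8s.per_tensor
    print(resolve_quantized_add_variant(torch.uint8, torch.uint8))
    # cadence::quantized_add_asym8uxasym8u_asym8u.per_tensor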

backends/cadence/fusion_g3/operators/op_clamp.cpp

Lines changed: 3 additions & 2 deletions
@@ -45,6 +45,7 @@ bool is_out_of_bounds(CTYPE_VAL val) {
 }
 
 ET_NODISCARD bool check_bounds(
+    KernelRuntimeContext& ctx,
     const Scalar& val_scalar,
     const ScalarType& val_type,
     const ScalarType& out_type,
@@ -107,14 +108,14 @@ Tensor& clamp_out(
   if (has_min) {
     ET_KERNEL_CHECK(
         ctx,
-        check_bounds(min_opt.value(), min_type, out_type, "minimum"),
+        check_bounds(ctx, min_opt.value(), min_type, out_type, "minimum"),
         InvalidArgument,
         out);
   }
   if (has_max) {
     ET_KERNEL_CHECK(
         ctx,
-        check_bounds(max_opt.value(), max_type, out_type, "maximum"),
+        check_bounds(ctx, max_opt.value(), max_type, out_type, "maximum"),
         InvalidArgument,
         out);
   }
