Merge branch 'main' into private-s22-devices

huydhn · huydhn · commit 73bd4bc65270 · 2025-04-28T20:05:13.000-07:00
diff --git a/.github/scripts/extract_benchmark_results.py b/.github/scripts/extract_benchmark_results.py
@@ -349,7 +349,10 @@ def transform(
     # Overwrite the device name here with the job name as it has more information about
     # the device, i.e. Samsung Galaxy S22 5G instead of just Samsung
     for r in benchmark_results:
-        r["deviceInfo"]["device"] = job_name
+        is_private_device = job_report.get("is_private_instance", False)
+        r["deviceInfo"]["device"] = (
+            f"{job_name} (private)" if is_private_device else job_name
+        )
 
     # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
     return [
@@ -363,6 +366,7 @@ def transform(
                     "benchmark_config": json.dumps(benchmark_config),
                     "job_conclusion": "SUCCESS",
                     "job_arn": job_report.get("arn", ""),
+                    "instance_arn": job_report.get("instance_arn", ""),
                 },
             },
             "model": {
diff --git a/.github/workflows/apple-perf-private-device-experiment.yml b/.github/workflows/apple-perf-private-device-experiment.yml
@@ -1,18 +1,16 @@
 name: apple-perf (private devices)
 
 on:
-  # TODO (huydhn): Disable the schedule run until we land the change to add device pool and device name
-  # to separate between public and private iOS devices
-  # schedule:
-  # - cron: 0 0,4,8,12,16,20 * * *
+  schedule:
+   - cron: 0 0,4,8,12,16,20 * * *
   pull_request:
     paths:
       - .github/workflows/apple-perf-private-device-experiment.yml
-  # push:
-  #   branches:
-  #     - main
-  #   paths:
-  #     - .github/workflows/apple-perf-private-device-experiment.yml
+  push:
+    branches:
+      - main
+    paths:
+      - .github/workflows/apple-perf-private-device-experiment.yml
   # Note: GitHub has an upper limit of 10 inputs
   workflow_dispatch:
     inputs:
diff --git a/backends/cadence/hifi/operators/op_bmm.cpp b/backends/cadence/hifi/operators/op_bmm.cpp
@@ -16,8 +16,8 @@ using exec_aten::ScalarType;
 using executorch::runtime::KernelRuntimeContext;
 using executorch::runtime::kTensorDimensionLimit;
 using executorch::runtime::resize_tensor;
-using executorch::runtime::tensors_have_same_dim_order;
 using executorch::runtime::tensor_is_default_dim_order;
+using executorch::runtime::tensors_have_same_dim_order;
 using torch::executor::check_bmm_args;
 using torch::executor::Error;
 using torch::executor::get_bmm_out_target_size;
@@ -78,16 +78,16 @@ Tensor& bmm_out(
     WORD32 out_stride = p;
 
     WORD32* __restrict__ tmp =
-          (WORD32* __restrict__)kernels::allocate_temp_memory(
-              ctx, (batch_size * m * p) * sizeof(float));
+        (WORD32* __restrict__)kernels::allocate_temp_memory(
+            ctx, (batch_size * m * p) * sizeof(float));
 
     ET_KERNEL_CHECK(ctx, tmp != nullptr, MemoryAllocationFailed, out);
 
     tmp[batch_size * m * p] = {0};
 
     WORD32* __restrict__ p_o =
-          (WORD32* __restrict__)kernels::allocate_temp_memory(
-              ctx, (batch_size * m * p) * sizeof(WORD32));
+        (WORD32* __restrict__)kernels::allocate_temp_memory(
+            ctx, (batch_size * m * p) * sizeof(WORD32));
 
     ET_KERNEL_CHECK(ctx, p_o != nullptr, MemoryAllocationFailed, out);
 
diff --git a/backends/cadence/hifi/operators/op_mm.cpp b/backends/cadence/hifi/operators/op_mm.cpp
@@ -76,8 +76,8 @@ Tensor& mm_out(
     WORD32 out_stride = p;
 
     WORD32* __restrict__ p_o =
-          (WORD32* __restrict__)kernels::allocate_temp_memory(
-              ctx, (n * p) * sizeof(WORD32));
+        (WORD32* __restrict__)kernels::allocate_temp_memory(
+            ctx, (n * p) * sizeof(WORD32));
 
     WORD32 p_inp_shape[2];
     p_inp_shape[0] = n;
@@ -146,4 +146,4 @@ Tensor& mm_out(
 } // namespace native
 } // namespace HiFi
 } // namespace impl
-} // namespace cadence
+} // namespace cadence
diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl
@@ -34,6 +34,11 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
 OPERATORS = [
     "add",
     "atan2",
+    "bmm",
+    "mm",
+    "slice_copy",
+    "split_with_sizes_copy",
+    "view_copy",
     "cat",
     "clamp",
     "dequantize_per_tensor",
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl
@@ -38,18 +38,21 @@ layout(push_constant) uniform restrict Block {
   ivec4 weight_sizes;
 };
 
+#include "indexing_utils.h"
+
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 shared VEC4_T partial_c[NGROUPS][NWORKERS][TILE_ROWS];
 
 void main() {
-  const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS;
-  const uint out_col = gl_GlobalInvocationID.x << 2;
+  const uint out_width_ntexels = divup4(out_sizes.x);
+  const uint out_col = (gl_GlobalInvocationID.x % out_width_ntexels) << 2;
+  const uint out_row = (gl_GlobalInvocationID.x / out_width_ntexels) * TILE_ROWS;
 
   const int gid = int(gl_LocalInvocationID.x); // group id
   const int wid = int(gl_LocalInvocationID.z); // worker id
 
-  if (out_col >= out_sizes.x || out_row >= out_sizes.y) {
+  if (out_row >= out_sizes.y) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl
@@ -36,13 +36,18 @@ layout(push_constant) uniform restrict Block {
   ivec4 weight_sizes;
 };
 
+#include "indexing_utils.h"
+
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+
 void main() {
-  const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS;
-  const uint out_col = gl_GlobalInvocationID.x << 2;
+  const uint16_t out_width_ntexels = uint16_t(divup4(out_sizes.x));
+  const uint16_t out_col = uint16_t((gl_GlobalInvocationID.x % out_width_ntexels) << 2);
+  const uint16_t out_row = uint16_t((gl_GlobalInvocationID.x / out_width_ntexels) * TILE_ROWS);
 
-  if (out_col >= out_sizes.x || out_row >= out_sizes.y) {
+  if (out_row >= uint16_t(out_sizes.y)) {
     return;
   }
 
@@ -51,29 +56,29 @@ void main() {
   VEC4_T c[TILE_ROWS];
 
   $if SCALES_STORAGE == "buffer":
-    const VEC4_T scales = VEC4_T(t_scales[out_col >> 2]);
+    const VEC4_T scales = VEC4_T(t_scales[int(out_col >> 2)]);
   $else:
-    const VEC4_T scales = VEC4_T(texelFetch(t_scales, ivec2(out_col >> 2, 0), 0));
+    const VEC4_T scales = VEC4_T(texelFetch(t_scales, u16vec2(out_col >> 2, 0), 0));
 
   [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) {
     c[i] = VEC4_T(0.0);
   }
 
-  for (int pos = 0; pos < in_sizes.x; pos += 4) {
+  for (uint16_t pos = uint16_t(0); pos < uint16_t(in_sizes.x); pos += uint16_t(4)) {
     // Preload weight tensor
     [[unroll]] for (int i = 0; i < 4; i++) {
       $if WEIGHT_STORAGE == "buffer":
         b[i] = t_weight[((pos + i) * out_sizes.x + out_col) >> 2];
       $else:
-        b[i] = VEC4_T(texelFetch(t_weight, ivec2(out_col >> 2, pos + i), 0));
+        b[i] = VEC4_T(texelFetch(t_weight, u16vec2(out_col >> 2, pos + i), 0));
     }
 
     // Preload input tensor
     [[unroll]] for (int i = 0; i < TILE_ROWS; i++) {
       $if IN_STORAGE == "buffer":
         a[i] = t_in[((out_row + i) * in_sizes.x + pos) >> 2];
       $else:
-        a[i] = VEC4_T(texelFetch(t_in, ivec3(pos >> 2, out_row + i, 0), 0));
+        a[i] = VEC4_T(texelFetch(t_in, u16vec3(pos >> 2, out_row + i, 0), 0));
     }
 
     // Accumulate output
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.yaml
@@ -16,10 +16,10 @@ q_8w_linear_tiled:
     TILE_ROWS:
       - VALUE: 1
         SUFFIX: o4x1
+      - VALUE: 2
+        SUFFIX: o4x2
       - VALUE: 4
         SUFFIX: o4x4
-      - VALUE: 6
-        SUFFIX: o4x6
   shader_variants:
     - NAME: q_8w_linear_tiled_texture3d_texture3d_texture2d_texture2d_float
     - NAME: q_8w_linear_tiled_buffer_buffer_texture2d_texture2d_float
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
@@ -180,10 +180,10 @@ void add_q_8w_linear_tiled_node(
 
   std::vector<int64_t> mat1_sizes = graph.sizes_of(mat1);
   const int64_t M = utils::val_at(-2, mat1_sizes);
-  int out_tile_nrows = 4;
+  uint32_t out_tile_nrows = 4;
   if (M % 6 == 0) {
-    kernel_name += "_o4x6";
-    out_tile_nrows = 6;
+    kernel_name += "_o4x2";
+    out_tile_nrows = 2;
   } else if (M % 4 == 0) {
     kernel_name += "_o4x4";
     out_tile_nrows = 4;
@@ -195,8 +195,11 @@ void add_q_8w_linear_tiled_node(
     out_tile_nrows = 4;
   }
 
-  utils::uvec3 global_wg_size = graph.logical_limits_of(out);
-  global_wg_size[1] = global_wg_size[1] / out_tile_nrows;
+  utils::uvec3 out_limits = graph.logical_limits_of(out);
+  utils::uvec3 global_wg_size = {
+      out_limits[0] * (utils::div_up(out_limits[1], out_tile_nrows)),
+      1,
+      out_limits[2]};
 
   utils::uvec3 local_wg_size{64, 1, 1};
   if (use_coop_algorithm) {
diff --git a/exir/capture/_config.py b/exir/capture/_config.py
@@ -102,3 +102,6 @@ class ExecutorchBackendConfig:
     # serialized in the PTE file. Its value is ignored if mutable buffers are not
     # memory planned as the names must be serialized in that case.
     emit_mutable_buffer_names: bool = False
+
+    # If set to true, we run quant fusion and constant propagation passes
+    do_quant_fusion_and_const_prop: bool = False
diff --git a/exir/passes/TARGETS b/exir/passes/TARGETS
@@ -154,6 +154,8 @@ python_library(
         "//caffe2:torch",
         "//executorch/exir:pass_base",
         "//executorch/exir/dialects:lib",
+        "//pytorch/ao:torchao",
+        "//executorch/exir/passes:constant_prop_pass",
     ],
 )
 
diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py
@@ -6,6 +6,7 @@
 
 # pyre-unsafe
 
+import logging
 from collections import OrderedDict
 from typing import cast, Mapping, Optional
 
@@ -29,6 +30,32 @@
 # Propagating aten.full can significantly increase compiled model size.
 _DEFAULT_SKIP_TARGETS = {exir_ops.edge.aten.full.default}
 
+# Do not const prop quantization primitives: every op in _QDQ_OPS is added to _DEFAULT_SKIP_TARGETS below
+_QDQ_OPS = [
+    exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
+    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
+    exir_ops.edge.quantized_decomposed.convert_element_type.no_fuse,
+    exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+    exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
+    exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
+    exir_ops.edge.quantized_decomposed.choose_qparams.tensor,
+]
+try:
+    import torchao  # noqa: F401
+    # NOTE(review): torchao is not referenced by name below; presumably imported so its ops are registered -- confirm
+    _QDQ_OPS.extend(
+        [
+            exir_ops.edge.torchao.dequantize_affine.default,
+            exir_ops.edge.torchao.quantize_affine.default,
+            exir_ops.edge.torchao.choose_qparams_affine.default,
+        ]
+    )
+except ImportError:
+    pass  # torchao is optional: when it cannot be imported, only the quantized_decomposed ops are skipped
+_DEFAULT_SKIP_TARGETS.update(set(_QDQ_OPS))
+
+
 _PRIMITIVE_TYPES = (
     float,
     int,
@@ -308,7 +335,9 @@ def constant_prop_pass(
         if node.target == torch.ops.higher_order.cond
     ]
     if len(has_control_flow) > 0:
-        raise RuntimeError("constant_prop_pass for control flow is not supported yet.")
+        logging.warning(
+            "constant_prop_pass does not constant propagate in control flow modules"
+        )
 
     const_node_to_tensor = get_propagated_const_tensor_dict(
         exported_program, custom_skip_targets
diff --git a/exir/passes/quant_fusion_pass.py b/exir/passes/quant_fusion_pass.py
@@ -7,6 +7,8 @@
 import torch
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
+from executorch.exir.passes.constant_prop_pass import constant_prop_pass
+from torch.export import ExportedProgram
 from torch.fx import GraphModule, subgraph_rewriter
 from torch.fx.passes.infra.pass_base import PassResult
 from torch.utils import _pytree as pytree
@@ -139,3 +141,13 @@ def call(self, graph_module: GraphModule) -> PassResult:
         graph_module.graph.lint()
         graph_module.graph.eliminate_dead_code()
         return PassResult(graph_module, True)
+
+
+def quant_fusion_and_const_prop_pass(program: ExportedProgram) -> ExportedProgram:
+    gm = program.graph_module  # step 1: run QuantFusionPass on the program's graph module
+    gm_res = QuantFusionPass(_fix_node_meta_val=True)(gm)
+    gm = gm_res.graph_module  # NOTE(review): `gm` is not read again after this assignment
+    # step 2: constant propagation is applied to `program` (not `gm`); its result is what gets returned
+    # Do const prop pass to remove packing/dtype conversion ops
+    program = constant_prop_pass(program)
+    return program
diff --git a/exir/program/_program.py b/exir/program/_program.py
@@ -52,6 +52,7 @@
 from executorch.exir.passes.normalize_view_copy_base_pass import (
     NormalizeViewCopyBasePass,
 )
+from executorch.exir.passes.quant_fusion_pass import quant_fusion_and_const_prop_pass
 from executorch.exir.passes.remove_graph_asserts_pass import (
     RemoveGraphAssertsPass,
     RemoveNonCoreAtenOpGraphAssertsPass,
@@ -1524,9 +1525,15 @@ def to_executorch(
             after it has been transformed to the ExecuTorch backend.
         """
         config = config if config else ExecutorchBackendConfig()
-
         execution_programs: Dict[str, ExportedProgram] = {}
         for name, program in self._edge_programs.items():
+            if config.do_quant_fusion_and_const_prop:
+                if program.graph_signature.backward_signature is not None:
+                    raise Exception(
+                        "Cannot run do_quant_fusion_and_const_prop on a graph with a backward signature intended for on-device training."
+                        " Please set do_quant_fusion_and_const_prop to False in the ExecutorchBackendConfig."
+                    )
+                program = quant_fusion_and_const_prop_pass(program)
             program = weights_to_outputs_pass(program)
             program = unsafe_remove_auto_functionalized_pass(program)
             gm, new_signature = insert_write_back_for_buffers_pass(program)
diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py
diff --git a/exir/tests/test_quant_fusion_pass.py b/exir/tests/test_quant_fusion_pass.py
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
diff --git a/extension/pytree/targets.bzl b/extension/pytree/targets.bzl