pytorch
diff --git a/‎.github/scripts/extract_benchmark_results.py‎
Lines changed: 8 additions & 2 deletions b/‎.github/scripts/extract_benchmark_results.py‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎backends/cadence/aot/fuse_ops.py‎
Lines changed: 5 additions & 25 deletions b/‎backends/cadence/aot/fuse_ops.py‎
Lines changed: 5 additions & 25 deletions
diff --git a/‎backends/cadence/aot/tests/test_fusion_ops_passes.py‎
Lines changed: 20 additions & 0 deletions b/‎backends/cadence/aot/tests/test_fusion_ops_passes.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ComputeGraph.cpp‎
Lines changed: 5 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ComputeGraph.cpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ComputeGraph.h‎
Lines changed: 9 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ComputeGraph.h‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl‎
Lines changed: 122 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl‎
Lines changed: 122 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.yaml‎
Lines changed: 28 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.yaml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl‎
Lines changed: 18 additions & 16 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl‎
Lines changed: 18 additions & 16 deletions
@@ -341,6 +341,7 @@ def transform(
     benchmark_results: List,
     benchmark_config: Dict[str, str],
     job_name: str,
+    job_report: Any = {},
 ) -> List:
     """
     Transform the benchmark results into the format writable into the benchmark database
@@ -361,6 +362,7 @@ def transform(
                     # Just keep a copy of the benchmark config here
                     "benchmark_config": json.dumps(benchmark_config),
                     "job_conclusion": "SUCCESS",
+                    "job_arn": job_report.get("arn", ""),
                 },
             },
             "model": {
@@ -446,6 +448,7 @@ def transform_failure_record(
                 "app_type": app_type,
                 "job_conclusion": result,
                 "failure_type": level,
+                "job_arn": report.get("arn", ""),
                 "job_report": json.dumps(report),
             },
         },
@@ -512,6 +515,7 @@ def get_benchmark_config(
 def extract_benchmark_result_from_artifact(
     artifact: Dict[str, Any],
     benchmark_config: Dict[str, str],
+    job_report: Any,
 ) -> List[Any]:
     job_name = artifact.get("job_name", "")
     artifact_type = artifact.get("type", "")
@@ -532,7 +536,9 @@ def extract_benchmark_result_from_artifact(
         )
     if not benchmark_results:
         return []
-    return transform(app_type, benchmark_results, benchmark_config, job_name)
+    return transform(
+        app_type, benchmark_results, benchmark_config, job_name, job_report
+    )
 
 
 def get_app_type(type: str):
@@ -674,7 +680,7 @@ def process_benchmark_results(content: Any, app: str, benchmark_configs: str):
             for job_artifact in job_artifacts:
                 # generate result for each schema
                 results = extract_benchmark_result_from_artifact(
-                    job_artifact, benchmark_config
+                    job_artifact, benchmark_config, job_report
                 )
                 all_benchmark_results.extend(results)
     return all_benchmark_results
 
@@ -526,34 +526,14 @@ class FuseCascadedViewOps(ExportPass):
     Fuse a cascaded chain of view ops
     """
 
-    # Find a chain of view ops, and fuse them into a single permute op.
-
     def fuse_cascaded_view_ops(self, graph_module: torch.fx.GraphModule):
+        """
+        Rewire every view_copy whose input is itself a view_copy so that it
+        reads from that producer's input instead, then recompile the module.
+        Bypassed producer views are not erased by this method.
+        """
-        graph = graph_module.graph
-        for node in graph.nodes:
-            # We are only interested in view ops
-            if node.target != exir_ops.edge.aten.view_copy.default:
-                continue
-
-            # Get the cascaded chain of view ops starting at node
-            cascaded_view_ops = get_cascaded_ops(
-                [node], [exir_ops.edge.aten.view_copy.default]
-            )
-            # The chain must have more than 1 node
-            if len(cascaded_view_ops) == 1:
+        view_target = exir_ops.edge.aten.view_copy.default
+        # sort=True is expected to yield the views in graph order, so a
+        # producer has already been rewired when its consumers are visited and
+        # a chain of any length collapses in a single sweep.
+        for view_node in graph_module.graph.find_nodes(op="call_function", target=view_target, sort=True):
+            input_view = view_node.args[0]
+            # Nothing to fuse unless the input is produced by another view.
+            if input_view.op != "call_function" or input_view.target != view_target:
                 continue
 
-            last_view_node = cascaded_view_ops[-1]
-            with graph.inserting_before(last_view_node):
-                new_view = graph.call_function(
-                    exir_ops.edge.aten.view_copy.default,
-                    args=(node.args[0], last_view_node.args[1]),
-                )
-                last_view_node.replace_all_uses_with(new_view)
-
-            # Now erase the chain
-            for v in reversed(cascaded_view_ops):
-                graph.erase_node(v)
+            # Skip the producer: keep this view's own target shape but take
+            # the data from the producer's input. Other users of the producer
+            # are untouched, so branched chains are handled per consumer.
+            view_node.replace_input_with(input_view, input_view.args[0])
 
         graph_module.recompile()
 
 
@@ -222,6 +222,26 @@ def forward(self, x):
             count_node(graph_module, exir_ops.edge.aten.view_copy.default), 1
         )
 
+    def test_view_fusion_branched(self):
+        class ViewFusion(torch.nn.Module):
+            def forward(self, x):
+                y = x.view([1, 8, 15])
+                z = y.view([1, 1, 120])
+                t = y.view([120, 1, 1])
+                return z, t
+
+        x = torch.randn(8, 5, 3)
+        graph_module = (
+            compiler.export_to_cadence(ViewFusion(), (x,))
+            .exported_program()
+            .graph_module
+        )
+        graph_module.graph.eliminate_dead_code()
+        # z and t should be fused and y should be eliminated.
+        self.assertEqual(
+            count_node(graph_module, exir_ops.edge.aten.view_copy.default), 2
+        )
+
     def test_force_quant_dequant_fusion(self):
         class M(torch.nn.Module):
             def __init__(self):
 
@@ -179,6 +179,11 @@ utils::GPUMemoryLayout ComputeGraph::suggested_memory_layout(
   return utils::kChannelsPacked;
 }
 
+// Reports whether `substr` occurs anywhere in the name returned by the
+// adapter's device_name().
+bool ComputeGraph::device_name_contains(const char* substr) {
+  const auto& name = context_->adapter_ptr()->device_name();
+  return name.find(substr) != std::string::npos;
+}
+
 void ComputeGraph::check_no_active_value_ptrs() {
   VK_CHECK_COND(
       values_in_use_ == 0,
 
@@ -443,6 +443,15 @@ class ComputeGraph final {
   utils::GPUMemoryLayout suggested_memory_layout(
       const std::vector<int64_t>& sizes);
 
+  // True when the adapter behind this graph's context reports
+  // vkapi::DeviceType::ADRENO as its device type.
+  inline bool device_is_adreno() {
+    const auto type = context_->adapter_ptr()->device_type();
+    return type == vkapi::DeviceType::ADRENO;
+  }
+  // Name string reported by the adapter behind this graph's context.
+  const std::string& device_name() {
+    const auto& adapter = context()->adapter_ptr();
+    return adapter->device_name();
+  }
+
+  // Substring test against the adapter's device name; true when `substr`
+  // is found. Defined out of line.
+  bool device_name_contains(const char* substr);
+
   //
   // Graph Building
   //
 
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+#define VEC4_T ${buffer_gvec_type(DTYPE, 4)}
+
+#define TILE_ROWS ${TILE_ROWS}
+
+#define NGROUPS 8
+#define NWORKERS 8
+
+${define_required_extensions(DTYPE)}
+
+$if WEIGHT_STORAGE == "buffer":
+  ${define_required_extensions("int8")}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, IN_STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_weight", "int8", WEIGHT_STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_scales", DTYPE, SCALES_STORAGE, is_scalar_array=False)}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 out_sizes;
+  ivec4 in_sizes;
+  ivec4 weight_sizes;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+shared VEC4_T partial_c[NGROUPS][NWORKERS][TILE_ROWS];
+
+// Cooperative int8-weight linear kernel. Every invocation accumulates, for a
+// TILE_ROWS x 4 output tile, the input positions assigned to its worker id.
+// The per-worker partial sums are exchanged through the shared partial_c
+// array; the invocation with worker id 0 then sums them, applies the scales
+// and stores the tile.
+//
+// NOTE(review): partial_c is indexed with the local x and z invocation ids and
+// the position loop strides by 4 * NWORKERS, so this presumably expects a
+// local size of (NGROUPS, 1, NWORKERS) - confirm against the dispatch code.
+void main() {
+  const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS;
+  const uint out_col = gl_GlobalInvocationID.x << 2;
+
+  const int gid = int(gl_LocalInvocationID.x); // group id
+  const int wid = int(gl_LocalInvocationID.z); // worker id
+
+  // Do not return early for out-of-range tiles: barrier() below must be
+  // reached by every invocation of the workgroup. Out-of-range invocations
+  // skip the accumulation, publish zeros and exit only after the barrier.
+  const bool in_bounds = out_col < out_sizes.x && out_row < out_sizes.y;
+
+  VEC4_T a[TILE_ROWS];
+  VEC4_T b[4];
+  VEC4_T local_c[TILE_ROWS];
+
+  [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) {
+    local_c[i] = VEC4_T(0.0);
+  }
+
+  // Each worker handles every NWORKERS-th group of 4 input positions; the
+  // in_bounds term makes out-of-range invocations run zero iterations.
+  for (int pos = 4 * wid; in_bounds && pos < in_sizes.x; pos += (4 * NWORKERS)) {
+    // Preload t_weight
+    [[unroll]] for (int i = 0; i < 4; i++) {
+      $if WEIGHT_STORAGE == "buffer":
+        b[i] = t_weight[((pos + i) * weight_sizes.x + out_col) >> 2];
+      $else:
+        b[i] = VEC4_T(texelFetch(t_weight, ivec2(out_col >> 2, pos + i), 0));
+    }
+    // Preload t_in
+    for (int i = 0; i < TILE_ROWS; i++) {
+      $if IN_STORAGE == "buffer":
+        a[i] = t_in[((out_row + i) * in_sizes.x + pos) >> 2];
+      $else:
+        a[i] = VEC4_T(texelFetch(t_in, ivec3(pos >> 2, out_row + i, 0), 0));
+    }
+
+    // Accumulate partial output
+    [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) {
+        local_c[i] += a[i].x * b[0] +
+                      a[i].y * b[1] +
+                      a[i].z * b[2] +
+                      a[i].w * b[3];
+    }
+  }
+
+  // Publish this worker's partial sums for the reduction below.
+  [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) {
+    partial_c[gid][wid][i] = local_c[i];
+  }
+
+  memoryBarrierShared();
+  barrier();
+
+  // Only worker 0 of an in-range tile reduces and stores.
+  if (!in_bounds || wid != 0) {
+    return;
+  }
+
+  // Scales are needed only by the storing invocation, so they are fetched
+  // here; out_col is known to be in range at this point.
+  $if SCALES_STORAGE == "buffer":
+    const VEC4_T scales = VEC4_T(t_scales[out_col >> 2]);
+  $else:
+    const VEC4_T scales = VEC4_T(texelFetch(t_scales, ivec2(out_col >> 2, 0), 0));
+
+  VEC4_T c[TILE_ROWS];
+
+  // Sum the partial results of all workers of this group.
+  for (int row = 0; row < TILE_ROWS; ++row) {
+    c[row] = VEC4_T(0.0);
+    [[unroll]] for (int worker = 0; worker < NWORKERS; ++worker) {
+      c[row] += partial_c[gid][worker][row];
+    }
+  }
+
+  [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) {
+    $if OUT_STORAGE == "buffer":
+      if (out_row + i < out_sizes.y) {
+        t_out[((out_row + i) * out_sizes.x + out_col) >> 2] = c[i] * scales;
+      }
+    $else:
+      imageStore(t_out, ivec3(out_col >> 2, out_row + i, 0), c[i] * scales);
+  }
+}
@@ -0,0 +1,28 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Parameters for the q_8w_linear_coop shader template; the keys below match
+# the ${...} placeholders used by q_8w_linear_coop.glsl.
+q_8w_linear_coop:
+  parameter_names_with_default_values:
+    DTYPE: float
+    IN_STORAGE: texture3d
+    OUT_STORAGE: texture3d
+    WEIGHT_STORAGE: texture2d
+    SCALES_STORAGE: texture2d
+    TILE_ROWS: 4
+  # NOTE(review): only TILE_ROWS = 1 is listed here although the default above
+  # is 4 - presumably just the "o4x1" variants get generated; confirm against
+  # the shader codegen.
+  generate_variant_forall:
+    TILE_ROWS:
+      - VALUE: 1
+        SUFFIX: o4x1
+  # Each variant names only the storage parameters it changes; the first one
+  # lists none.
+  shader_variants:
+    - NAME: q_8w_linear_coop_texture3d_texture3d_texture2d_texture2d_float
+    - NAME: q_8w_linear_coop_buffer_buffer_texture2d_texture2d_float
+      IN_STORAGE: buffer
+      OUT_STORAGE: buffer
+    - NAME: q_8w_linear_coop_buffer_buffer_buffer_buffer_float
+      IN_STORAGE: buffer
+      OUT_STORAGE: buffer
+      WEIGHT_STORAGE: buffer
+      SCALES_STORAGE: buffer
@@ -17,17 +17,17 @@
 
 ${define_required_extensions(DTYPE)}
 
-$if STORAGE == "buffer":
+$if WEIGHT_STORAGE == "buffer":
   ${define_required_extensions("int8")}
 
 #extension GL_EXT_control_flow_attributes : require
 
 layout(std430) buffer;
 
-${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_weight", "int8", STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_scales", DTYPE, STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, IN_STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_weight", "int8", WEIGHT_STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_scales", DTYPE, SCALES_STORAGE, is_scalar_array=False)}
 
 
 layout(push_constant) uniform restrict Block {
@@ -50,10 +50,10 @@ void main() {
   VEC4_T b[4];
   VEC4_T c[TILE_ROWS];
 
-  $if STORAGE == "buffer":
+  $if SCALES_STORAGE == "buffer":
     const VEC4_T scales = VEC4_T(t_scales[out_col >> 2]);
   $else:
-    const VEC4_T scales = VEC4_T(texelFetch(t_scales, ivec3(out_col >> 2, 0, 0), 0));
+    const VEC4_T scales = VEC4_T(texelFetch(t_scales, ivec2(out_col >> 2, 0), 0));
 
   [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) {
     c[i] = VEC4_T(0.0);
@@ -62,30 +62,32 @@ void main() {
   for (int pos = 0; pos < in_sizes.x; pos += 4) {
     // Preload weight tensor
     [[unroll]] for (int i = 0; i < 4; i++) {
-      $if STORAGE == "buffer":
-        b[i] = t_weight[((pos + i) * B_sizes.x + out_col) >> 2];
+      $if WEIGHT_STORAGE == "buffer":
+        b[i] = t_weight[((pos + i) * out_sizes.x + out_col) >> 2];
       $else:
-        b[i] = VEC4_T(texelFetch(t_weight, ivec3(out_col >> 2, pos + i, 0), 0));
+        b[i] = VEC4_T(texelFetch(t_weight, ivec2(out_col >> 2, pos + i), 0));
     }
 
     // Preload input tensor
     [[unroll]] for (int i = 0; i < TILE_ROWS; i++) {
-      $if STORAGE == "buffer":
-        a[i] = t_in[((out_row + i) * in_sizes.x + (pos)) >> 2];
+      $if IN_STORAGE == "buffer":
+        a[i] = t_in[((out_row + i) * in_sizes.x + pos) >> 2];
       $else:
         a[i] = VEC4_T(texelFetch(t_in, ivec3(pos >> 2, out_row + i, 0), 0));
     }
 
-    // Compute partial output
+    // Accumulate output
     [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) {
         c[i] += a[i].x * b[0] + a[i].y * b[1] + a[i].z * b[2] + a[i].w * b[3];
     }
   }
 
-  // Store output tensor
+  // Store to output tensor
   [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) {
-    $if STORAGE == "buffer":
-      t_out[((out_row + i) * out_sizes.x + out_col) >> 2] = c[i] * scales;
+    $if OUT_STORAGE == "buffer":
+      if (out_row + i < out_sizes.y) {
+        t_out[((out_row + i) * out_sizes.x + out_col) >> 2] = c[i] * scales;
+      }
     $else:
       imageStore(t_out, ivec3(out_col >> 2, out_row + i, 0), c[i] * scales);
   }