pytorch
diff --git a/‎backends/vulkan/op_registry.py‎
Lines changed: 10 additions & 0 deletions b/‎backends/vulkan/op_registry.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/reduce_op_defs.glslh‎
Lines changed: 94 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/reduce_op_defs.glslh‎
Lines changed: 94 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.glsl‎
Lines changed: 122 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.glsl‎
Lines changed: 122 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.yaml‎
Lines changed: 42 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/reduce_per_row_buffer.yaml‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/impl/ArgReduce.cpp‎
Lines changed: 56 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/impl/ArgReduce.cpp‎
Lines changed: 56 additions & 0 deletions
@@ -449,6 +449,7 @@ def try_find_keepdim_arg(node: torch.fx.Node) -> bool:
             return False
 
         keepdim = try_find_keepdim_arg(node)
+        # keepdim = False is not supported yet
         if isinstance(keepdim, bool) and not keepdim:
             return False
 
@@ -461,6 +462,15 @@ def pick_io_storage_for_reduce(node: torch.fx.Node):
         input_tensor = node.args[0]
         ndim = input_tensor.meta["val"].ndim
         dim_list = node.args[1]
+
+        # For 1D reductions, a special case is implemented for reducing the width dim
+        if isinstance(dim_list, list) and len(dim_list) == 1:
+            if dim_list[0] == -1:
+                inputs_storage = utils.ANY_TEXTURE.make_union(utils.CONTIGUOUS_BUFFER)
+                outputs_storage = inputs_storage
+                return inputs_storage, outputs_storage
+
+        # For 2D reductions, the packed dimension cannot be one of the reduced dims
         if isinstance(dim_list, list) and len(dim_list) == 2:
             reduce_dim1_whcn = utils.nchw_dim_to_whcn_dim(dim_list[0], ndim)
             reduce_dim2_whcn = utils.nchw_dim_to_whcn_dim(dim_list[1], ndim)
 
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef REDUCE_OP_DEFS_GLSLH
+#define REDUCE_OP_DEFS_GLSLH
+
+// Reduction state carried by each worker and merged across workers.
+// T is the element type macro, which the including shader must define before
+// including this header.
+struct Accum {
+  // Running reduced value: the partial sum, or the current max/min candidate
+  T val;
+  // Index associated with val; maintained by the amax/amin functions
+  uint idx;
+  // Number of elements folded in; used as the divisor when computing a mean
+  uint count;
+};
+
+// Seeds `accum` from a single element: its value, its index, and an element
+// count of one.
+void init_accum(out Accum accum, T val, uint idx) {
+  accum = Accum(val, idx, 1u);
+}
+
+// Produces an empty accumulator: zero value, index zero, and a count of zero
+// so that it contributes nothing to a sum or to a mean's divisor.
+void init_accum_zero(out Accum accum) {
+  accum = Accum(T(0), 0u, 0u);
+}
+
+// Sum / Mean
+
+// Folds one element into a running sum and bumps the element count.
+// `idx` is not used by the sum; it is accepted so this function has the same
+// parameter list as the amax/amin update functions.
+void update_accum_sum(inout Accum accum, T val, uint idx) {
+  accum.count++;
+  accum.val = accum.val + val;
+}
+
+// Combines two partial sums: both the values and the element counts add up.
+void merge_accum_sum(inout Accum accum, const Accum other) {
+  accum.count = accum.count + other.count;
+  accum.val = accum.val + other.val;
+}
+
+// Converts an accumulated sum into a mean by dividing by the number of
+// elements that were folded in.
+void postprocess_accum_mean(inout Accum accum) {
+  T divisor = T(accum.count);
+  accum.val = accum.val / divisor;
+}
+
+// Amax (maximum value)
+
+// Folds one element into a running maximum, tracking the index of the
+// maximum. Among equal values the lowest index is kept.
+void update_accum_amax(inout Accum accum, T val, uint idx) {
+  if (val > accum.val) {
+    // Strictly larger value: adopt both the value and its index
+    accum.val = val;
+    accum.idx = idx;
+  } else if (val == accum.val && idx < accum.idx) {
+    // Tie on value: prefer the earlier position
+    accum.idx = idx;
+  }
+}
+
+// Merges another worker's partial maximum into `accum`.
+//
+// An accumulator with count == 0 holds no real element (see
+// init_accum_zero), so its placeholder value must never win the comparison:
+// `other` is ignored when it is empty, and adopted unconditionally when
+// `accum` is empty. This mirrors merge_accum_amin.
+void merge_accum_amax(inout Accum accum, const Accum other) {
+  if (other.count > 0 && (accum.count == 0 || other.val > accum.val)) {
+    accum.val = other.val;
+    accum.idx = other.idx;
+  }
+  // For equal values, select the lower index
+  if (other.val == accum.val && other.idx < accum.idx) {
+    accum.idx = other.idx;
+  }
+}
+
+// Amin (minimum value)
+
+// Folds one element into a running minimum, tracking the index of the
+// minimum. Among equal values the lowest index is kept.
+void update_accum_amin(inout Accum accum, T val, uint idx) {
+  if (val < accum.val) {
+    // Strictly smaller value: adopt both the value and its index
+    accum.val = val;
+    accum.idx = idx;
+  } else if (val == accum.val && idx < accum.idx) {
+    // Tie on value: prefer the earlier position
+    accum.idx = idx;
+  }
+}
+
+// Merges another worker's partial minimum into `accum`. An empty `other`
+// (count == 0) never replaces the current value, and an empty `accum` always
+// adopts a non-empty `other`. Among equal values the lowest index is kept.
+void merge_accum_amin(inout Accum accum, const Accum other) {
+  if (other.count > 0 && (accum.count == 0 || other.val < accum.val)) {
+    // `other` holds a real, strictly better candidate
+    accum.val = other.val;
+    accum.idx = other.idx;
+  } else if (other.val == accum.val && other.idx < accum.idx) {
+    // Tie on value: prefer the earlier position
+    accum.idx = other.idx;
+  }
+}
+
+#endif // REDUCE_OP_DEFS_GLSLH
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define T ${texel_load_component_type(DTYPE, "buffer")}
+
+#define NUM_OUTPUTS_PER_WG 1
+#define NUM_WORKERS_PER_OUTPUT 64
+
+${define_active_storage_type("buffer")}
+${define_required_extensions(DTYPE)}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+#include "reduce_op_defs.glslh"
+
+$if OUTPUT_IS_INDICES:
+  ${layout_declare_tensor(B, "w", "t_out", "int", "buffer")}
+$else:
+  ${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
+
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
+
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
+${layout_declare_ubo(B, "BufferMetadata", "inp")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Shared memory for cooperative reduction
+shared Accum shared_values[NUM_OUTPUTS_PER_WG][NUM_WORKERS_PER_OUTPUT];
+
+#define init_fn ${INIT_ACCUM_FN}
+#define update_fn ${UPDATE_ACCUM_FN}
+#define merge_fn ${MERGE_ACCUM_FN}
+
+$if POSTPROCESS_ACCUM_FN != "none":
+  #define postprocess_fn ${POSTPROCESS_ACCUM_FN}
+
+$if OOB_INIT_MODE == "zero":
+  #define OOB_INIT_MODE 0
+$else:
+  #define OOB_INIT_MODE 1
+
+$if OUTPUT_IS_INDICES:
+  #define OUTPUT_IS_INDICES
+
+#extension GL_EXT_debug_printf : require
+
+// Reduces one row of t_in (width(inp) consecutive elements) into a single
+// element of t_out.
+//
+// The output element is selected by gl_GlobalInvocationID.y. The workers along
+// local x each fold a strided subset of the row into a private accumulator,
+// publish it to shared memory, and then combine the partial results with a
+// halving merge loop. Worker 0 writes the final value (or, for the index
+// variants, the index of the selected element within the row).
+void main() {
+  const uint out_bufi = gl_GlobalInvocationID.y;
+
+  if (out_of_bounds(out_bufi, outp)) {
+    return;
+  }
+
+  // Local indices
+  const uint worker_id = gl_LocalInvocationID.x;
+  const uint output_id = gl_LocalInvocationID.y;
+
+  // Buffer index of the first element of the row being reduced
+  const uint in_bufi_base = out_bufi * width(inp);
+
+  Accum local_accum;
+  // Initialize accumulator with the first element being processed
+  if (worker_id < width(inp)) {
+    const uint in_bufi = in_bufi_base + worker_id;
+    init_fn(local_accum, t_in[in_bufi], worker_id);
+  }
+  // For out of bounds case, initialization depends on reduction op
+  else {
+#if OOB_INIT_MODE == 0
+    // Init with a zero value
+    init_accum_zero(local_accum);
+#else
+    // Init with the first value (i.e. amin, amax)
+    init_fn(local_accum, t_in[in_bufi_base], 0);
+#endif
+  }
+
+  // Each worker visits every NUM_WORKERS_PER_OUTPUT-th element of the row,
+  // starting one stride past the element used for initialization
+  for (uint x = worker_id + NUM_WORKERS_PER_OUTPUT; x < width(inp);
+       x += NUM_WORKERS_PER_OUTPUT) {
+    update_fn(local_accum, t_in[in_bufi_base + x], x);
+  }
+
+  shared_values[output_id][worker_id] = local_accum;
+
+  memoryBarrierShared();
+  barrier();
+
+  // Halving merge: on each step the lower half of the active workers absorbs
+  // the partial result of its partner in the upper half
+  for (int i = NUM_WORKERS_PER_OUTPUT / 2; i > 0; i >>= 1) {
+    if (worker_id < i) {
+      merge_fn(
+        shared_values[output_id][worker_id],
+        shared_values[output_id][worker_id + i]);
+    }
+    memoryBarrierShared();
+    barrier();
+  }
+
+  if (worker_id == 0) {
+    local_accum = shared_values[output_id][0];
+#ifdef postprocess_fn
+    postprocess_fn(local_accum);
+#endif
+
+#ifdef OUTPUT_IS_INDICES
+    // argmin / argmax: emit the position of the selected element in the row
+    t_out[out_bufi] = int(local_accum.idx);
+#else
+    t_out[out_bufi] = local_accum.val;
+#endif
+  }
+}
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+reduce_per_row_buffer:
+  # Defaults describe the plain sum reduction; each shader variant below
+  # overrides only the parameters that differ from these.
+  parameter_names_with_default_values:
+    DTYPE: float
+    INIT_ACCUM_FN: init_accum
+    UPDATE_ACCUM_FN: update_accum_sum
+    MERGE_ACCUM_FN: merge_accum_sum
+    # "none" means the shader does not define postprocess_fn
+    POSTPROCESS_ACCUM_FN: none
+    # "zero" makes workers past the end of the row start from
+    # init_accum_zero; any other value makes them start from the row's first
+    # element instead
+    OOB_INIT_MODE: zero
+    # When true, t_out is declared as int and the shader takes its
+    # OUTPUT_IS_INDICES output path
+    OUTPUT_IS_INDICES: false
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: float
+      - VALUE: half
+      - VALUE: int32
+  shader_variants:
+    - NAME: sum_per_row_buffer
+    - NAME: mean_per_row_buffer
+      POSTPROCESS_ACCUM_FN: postprocess_accum_mean
+    - NAME: amax_per_row_buffer
+      UPDATE_ACCUM_FN: update_accum_amax
+      MERGE_ACCUM_FN: merge_accum_amax
+      OOB_INIT_MODE: first_element
+    - NAME: amin_per_row_buffer
+      UPDATE_ACCUM_FN: update_accum_amin
+      MERGE_ACCUM_FN: merge_accum_amin
+      OOB_INIT_MODE: first_element
+    # The arg* variants reuse the amax/amin accumulation functions and differ
+    # only in OUTPUT_IS_INDICES
+    - NAME: argmax_per_row_buffer
+      UPDATE_ACCUM_FN: update_accum_amax
+      MERGE_ACCUM_FN: merge_accum_amax
+      OOB_INIT_MODE: first_element
+      OUTPUT_IS_INDICES: true
+    - NAME: argmin_per_row_buffer
+      UPDATE_ACCUM_FN: update_accum_amin
+      MERGE_ACCUM_FN: merge_accum_amin
+      OOB_INIT_MODE: first_element
+      OUTPUT_IS_INDICES: true
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Reduce.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+
+namespace vkcompute {
+
+// Common implementation behind argmin and argmax.
+//
+// `args` is read positionally as (in, dim, keepdim, out). The input must use
+// buffer storage and the reduction dim, after normalization, must be the last
+// dim of the input; both are enforced with VK_CHECK_COND. A `dim` of None is
+// treated as dim 0. `op_name` is forwarded to add_reduce_per_row_node.
+void arg_reduce_impl(
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args,
+    const std::string& op_name) {
+  const ValueRef in = args.at(0);
+  const ValueRef dim = args.at(1);
+  // keepdim is fetched to keep the positional layout explicit, but it is not
+  // consulted below
+  const ValueRef keepdim = args.at(2);
+  (void)keepdim;
+  const ValueRef out = args.at(3);
+
+  VK_CHECK_COND(graph.is_buffer_storage(in));
+
+  const int64_t dim_val =
+      graph.val_is_not_none(dim) ? graph.extract_scalar<int64_t>(dim) : 0;
+  const int64_t ndim = graph.dim_of(in);
+  const int64_t normalized_dim = normalize(dim_val, ndim);
+
+  // Only reduction over the last dim is handled by the per-row shader
+  VK_CHECK_COND(normalized_dim == ndim - 1);
+
+  add_reduce_per_row_node(graph, in, out, op_name);
+}
+
+// Operator entry point for argmin; delegates to arg_reduce_impl with the
+// "argmin" op name.
+void argmin(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  const std::string op_name = "argmin";
+  arg_reduce_impl(graph, args, op_name);
+}
+
+// Operator entry point for argmax; delegates to arg_reduce_impl with the
+// "argmax" op name.
+void argmax(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  const std::string op_name = "argmax";
+  arg_reduce_impl(graph, args, op_name);
+}
+
+// Binds the ATen operator names to the entry points defined above.
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.argmin.default, argmin);
+  VK_REGISTER_OP(aten.argmax.default, argmax);
+}
+
+} // namespace vkcompute