Skip to content

Commit 4f2b9f7

Browse files
author
pytorchbot
committed
2025-08-10 nightly release (6d56713)
1 parent a59a06e commit 4f2b9f7

37 files changed

+1425
-402
lines changed

backends/vulkan/op_registry.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,19 @@ def register_2d_pool_op():
435435
)
436436
def register_convolution_op():
437437
return OpFeatures(
438-
inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
438+
inputs_storage=[
439+
utils.CHANNELS_PACKED_TEXTURE, # input
440+
utils.NO_STORAGE, # weight (prepacked)
441+
utils.NO_STORAGE, # bias (prepacked)
442+
utils.NO_STORAGE, # stride (non tensor)
443+
utils.NO_STORAGE, # padding (non tensor)
444+
utils.NO_STORAGE, # dilation (non tensor)
445+
utils.NO_STORAGE, # transposed (non tensor)
446+
utils.NO_STORAGE, # output_padding (non tensor)
447+
utils.NO_STORAGE, # groups (non tensor)
448+
utils.NO_STORAGE, # output_min (non tensor)
449+
utils.NO_STORAGE, # output_max (non tensor)
450+
],
439451
supports_resize=True,
440452
supports_prepacking=True,
441453
)
@@ -491,17 +503,9 @@ def register_view_ops():
491503
# for both texture and buffer storage types.
492504
@update_features(exir_ops.edge.aten.cat.default)
493505
def register_cat_op():
494-
def check_cat_node(node: torch.fx.Node) -> bool:
495-
inputs = node.args[0]
496-
if isinstance(inputs, (list, tuple)) and len(inputs) <= 3:
497-
return True
498-
499-
return False
500-
501506
return OpFeatures(
502507
inputs_storage=utils.ANY_STORAGE,
503508
supports_resize=True,
504-
are_node_inputs_supported_fn=check_cat_node,
505509
)
506510

507511

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -517,6 +517,7 @@ void vTensorStorage::transition(
517517
vkapi::MemoryAccessFlags prev_access = last_access_.access;
518518

519519
const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0;
520+
const bool cur_written = (cur_access & vkapi::MemoryAccessType::WRITE) != 0;
520521

521522
VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED;
522523
VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED;
@@ -528,7 +529,13 @@ void vTensorStorage::transition(
528529
layout_changed = cur_layout != new_layout;
529530
}
530531

531-
if (prev_written || layout_changed) {
532+
// RAW: need to make sure current read sees previous writes
533+
// WAW: need to make sure the current write occurs after previous write so
534+
// the final value is correct.
535+
// WAR: need to make sure previous read does not read the value from the
536+
// current write.
537+
// RAR: no need for synchronization
538+
if (prev_written || cur_written || layout_changed) {
532539
VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage);
533540
if (0u == src_stage) {
534541
src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,33 @@ void ComputeGraph::prepare_pipelines() {
799799
pipeline_descriptors_ = std::unordered_set<
800800
vkapi::ComputePipelineCache::Key,
801801
vkapi::ComputePipelineCache::Hasher>();
802+
803+
const size_t total_node_count = execute_nodes_.size();
804+
size_t init_threshold = config_.execute_initial_threshold_node_count;
805+
size_t count_threshold = config_.execute_threshold_node_count;
806+
807+
// If max command buffer count is set, we need to adjust the thresholds to
808+
// accommodate execution within the limit, if total command buffers with
809+
// current thresholds would exceed execute_max_cmds
810+
if (config_.execute_max_cmds > 0) {
811+
// Worst case scenario we have one command buffer for nodes before init
812+
// threshold and config_.execute_max_cmds - 1 command buffers for the rest
813+
// of dispatches
814+
815+
// If command buffers created after offsetting init_threshold would exceed
816+
// max command buffer count, we need to adjust init and count thresholds
817+
const bool slicing_exceeds_max_cmds = (total_node_count - init_threshold) >
818+
count_threshold * (config_.execute_max_cmds - 1);
819+
if (total_node_count > init_threshold && slicing_exceeds_max_cmds) {
820+
// Increase count threshold so remaining nodes after offsetting init fits
821+
// in config_.execute_max_cmds - 1
822+
count_threshold = static_cast<size_t>(ceil(
823+
(total_node_count - init_threshold) /
824+
double(config_.execute_max_cmds - 1)));
825+
}
826+
}
827+
828+
execute_threshold_node_count_ = count_threshold;
802829
}
803830

804831
void ComputeGraph::submit_current_cmd(const bool final_use) {
@@ -888,6 +915,7 @@ void ComputeGraph::execute() {
888915
context_->set_cmd(/*reusable = */ true);
889916

890917
context_->cmd_reset_querypool();
918+
const size_t total_node_count = execute_nodes_.size();
891919
uint32_t encoded_node_count = 0;
892920

893921
for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
@@ -900,11 +928,13 @@ void ComputeGraph::execute() {
900928
const bool reached_threshold =
901929
encoded_node_count >= config_.execute_initial_threshold_node_count &&
902930
((encoded_node_count - config_.execute_initial_threshold_node_count) %
903-
config_.execute_threshold_node_count ==
931+
execute_threshold_node_count_ ==
904932
0);
905933

906934
// Create a new command buffer when threshold is reached
907-
if (reached_threshold) {
935+
// But avoid it if this is the last node, since last cmd buf is submitted
936+
// after the loop
937+
if (reached_threshold && encoded_node_count != total_node_count) {
908938
context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false);
909939
deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
910940
context_->set_cmd(true);

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,14 @@ class ComputeGraph final {
207207
// current Context's command buffer is submitted now.
208208
size_t staging_nbytes_in_cmd_ = 0;
209209

210+
// Represents the number of nodes to encode before submitting a command buffer.
211+
// If command buffers created with config.execute_threshold_node_count exceeds
212+
// config.execute_max_cmds, then execute_threshold_node_count will be
213+
// increased to fit command buffers within the limit. Otherwise,
214+
// execute_threshold_node_count will be set to
215+
// config.execute_threshold_node_count.
216+
size_t execute_threshold_node_count_ = 0;
217+
210218
public:
211219
//
212220
// Accessors

backends/vulkan/runtime/graph/GraphConfig.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ struct GraphConfig final {
6161
// by taking more advantage of parallelism between the CPU and GPU.
6262
size_t execute_initial_threshold_node_count = 0;
6363

64+
// If this number is greater than 0 then, during execute create at most this
65+
// many command buffers.
66+
size_t execute_max_cmds = 0;
67+
6468
vkapi::Adapter* external_adapter;
6569

6670
// Generate a default graph config with pre-configured settings

backends/vulkan/runtime/graph/ops/ExecuteNode.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class ExecuteNode {
4343
friend class ComputeGraph;
4444

4545
public:
46-
using ResizeFunction = const std::function<void(
46+
using ResizeFunction = std::function<void(
4747
ComputeGraph*,
4848
const std::vector<ArgGroup>&,
4949
const std::vector<ValueRef>&)>;

backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl

Lines changed: 44 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,21 @@ layout(std430) buffer;
2020

2121
#include "indexing_utils.h"
2222

23-
${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
23+
${layout_declare_tensor(B, "rw", "t_out", DTYPE, "buffer")}
2424

2525
$for i in range(NUM_INPUTS):
26-
${layout_declare_tensor(B, "r", "t_in" + str(i + 1), DTYPE, "buffer")}
26+
${layout_declare_tensor(B, "r", "t_inp" + str(i), DTYPE, "buffer")}
27+
28+
${layout_declare_tensor(B, "r", "t_concat_offset", "int", "buffer")}
2729

2830
${layout_declare_ubo(B, "int", "concat_dim")}
2931

3032
${layout_declare_ubo(B, "ivec4", "out_sizes")}
3133
${layout_declare_ubo(B, "ivec4", "out_strides")}
3234

3335
$for i in range(NUM_INPUTS):
34-
${layout_declare_ubo(B, "ivec4", "in" + str(i+1) + "_sizes")}
35-
${layout_declare_ubo(B, "ivec4", "in" + str(i+1) + "_strides")}
36+
${layout_declare_ubo(B, "ivec4", "inp" + str(i) + "_sizes")}
37+
${layout_declare_ubo(B, "ivec4", "inp" + str(i) + "_strides")}
3638

3739
${layout_declare_ubo(B, "int", "out_numel")}
3840

@@ -42,28 +44,53 @@ const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
4244

4345
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4446

47+
#define NUM_INPUTS ${NUM_INPUTS}
48+
49+
#include "concat_utils.glslh"
50+
51+
/*
52+
* This shader template concatenates up to NUM_INPUTS input tensors to the
53+
* output tensor along the concat_dim. Elements from the input tensor will
54+
* be inserted along the output's concat_dim starting at concat_offset.
55+
*/
4556
void main() {
46-
const int out_bufi = ivec3(gl_GlobalInvocationID).x;
47-
if (out_bufi >= out_numel) {
57+
const int tid = ivec3(gl_GlobalInvocationID).x;
58+
59+
// The 1-3 input tensors are interpreted as one concatenated tensor ("volume")
60+
// along the concat_dim for the purposes of tensor indexing. Each thread is
61+
// responsible for reading one item from this volume and writing it to the
62+
// appropriate output location.
63+
ivec4 inp_volume_sizes = out_sizes;
64+
inp_volume_sizes[concat_dim] = total_concat_dim_numel();
65+
66+
// Account for 0 size input tensors
67+
if (any(lessThanEqual(inp_volume_sizes, ivec4(0)))) {
68+
return;
69+
}
70+
71+
ivec4 inp_volume_tidx = nchwi_to_tidx(tid, inp_volume_sizes);
72+
73+
// bounds check
74+
if (any(greaterThanEqual(inp_volume_tidx, inp_volume_sizes))) {
4875
return;
4976
}
5077

51-
// Convert buffer linear index to 4-D tensor index for output
52-
const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);
78+
int concat_offset = t_concat_offset[0];
79+
80+
ivec4 out_tidx = inp_volume_tidx;
81+
out_tidx[concat_dim] += concat_offset;
5382

54-
// Determine which input tensor to read from
55-
ivec4 in_tidx = out_tidx;
83+
const uint out_bufi = tidx_to_bufi(out_tidx, out_strides);
5684

85+
// Go through the list of input tensors, and find which input this output
86+
// element should be read from.
5787
$for i in range(NUM_INPUTS):
58-
// Check if the index at the concat dim is within bounds of the input tensor
59-
// If so, read from that input tensor and write to output
60-
if (in_tidx[concat_dim] < in${i+1}_sizes[concat_dim]) {
61-
int in_bufi = tidx_to_bufi(in_tidx, in${i+1}_strides);
62-
t_out[out_bufi] = t_in${i+1}[in_bufi];
88+
if (inp_volume_tidx[concat_dim] < inp${i}_sizes[concat_dim]) {
89+
int inp_bufi = tidx_to_bufi(inp_volume_tidx, inp${i}_strides);
90+
t_out[out_bufi] = t_inp${i}[inp_bufi];
6391
return;
6492
}
65-
// otherwise, decrement the index at the concat dim
6693
else {
67-
in_tidx[concat_dim] -= in${i+1}_sizes[concat_dim];
94+
inp_volume_tidx[concat_dim] -= inp${i}_sizes[concat_dim];
6895
}
6996
}

0 commit comments

Comments
 (0)