
Commit f0af466

pytorchbot and SS-JIA authored
[ET-VK] Allow clone op to transfer between memory layouts and storage types (pytorch#6607)
Pull Request resolved: pytorch#6596

## Changes

As title. Extend the functionality of the `aten.clone` operator to allow transitioning the storage type and memory layout from the input tensor to the output tensor.

## Context

This functionality will be used to transition input tensors to the optimal storage type and memory layout before an op executes. The transition nodes will be added by a memory metadata tagging pass that will be introduced in a subsequent diff.

ghstack-source-id: 251229412
@exported-using-ghexport

Differential Revision: [D65277710](https://our.internmc.facebook.com/intern/diff/D65277710/)

Co-authored-by: Stephen Jia <[email protected]>
1 parent 27eac48 commit f0af466

14 files changed: +245 -42 lines
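At a high level, the updated `clone()` entry point in `Clone.cpp` (full diff below) now dispatches on the storage types of the source and destination tensors. A condensed summary for orientation only, using the identifiers from that file:

```cpp
// Orientation summary of the dispatch added in Clone.cpp (see the diff below).
//   texture -> texture, same memory layout      : add_clone_node            (plain copy)
//   texture -> texture, different memory layout : add_view_node             (re-packs the data)
//   texture -> buffer                           : add_image_to_buffer_node  (new in this commit)
//   buffer  -> texture                          : add_buffer_to_image_node  (new in this commit)
//   buffer  -> buffer                           : VK_THROW("... not supported yet!")
```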

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 16 additions & 0 deletions
@@ -612,6 +612,22 @@ class ComputeGraph final {
     return {t, staging};
   }
 
+  /*
+   * Add an input tensor with the specified properties along with its staging
+   * buffer.
+   */
+  inline IOValueRef add_input_tensor(
+      const std::vector<int64_t>& sizes,
+      const vkapi::ScalarType dtype,
+      const utils::StorageType storage_type,
+      const utils::GPUMemoryLayout memory_layout,
+      const int64_t shared_object_idx = -1) {
+    ValueRef t = add_tensor(
+        sizes, dtype, storage_type, memory_layout, shared_object_idx);
+    ValueRef staging = set_input_tensor(t);
+    return {t, staging};
+  }
+
   SharedObject& get_shared_object(const int64_t idx);
 
   //
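For illustration, a minimal usage sketch of the new overload. It assumes an existing `ComputeGraph` named `graph`; the dtype and layout constants (`vkapi::kFloat`, `utils::kChannelsPacked`) and the `value`/`staging` member names are illustrative assumptions rather than details taken from this diff:

```cpp
// Hedged example, not part of this commit: request a float, texture-backed
// input tensor with a channels-packed layout, plus its staging buffer.
std::vector<int64_t> sizes = {1, 4, 8, 8};
IOValueRef io = graph.add_input_tensor(
    sizes,
    vkapi::kFloat,            // dtype
    utils::kTexture3D,        // storage type
    utils::kChannelsPacked);  // memory layout; shared_object_idx defaults to -1
// io.value refers to the tensor; io.staging refers to its staging buffer.
```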

backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml

Lines changed: 3 additions & 3 deletions
@@ -9,11 +9,11 @@ bitw8_image_to_nchw_nobitw8buffer:
   STORAGE: texture3d
   DTYPE: int8
 generate_variant_forall:
-  DTYPE:
-    - VALUE: int8
-    - VALUE: uint8
   STORAGE:
     - VALUE: texture2d
     - VALUE: texture3d
+  DTYPE:
+    - VALUE: int8
+    - VALUE: uint8
 shader_variants:
   - NAME: bitw8_image_to_nchw_nobitw8buffer

backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl

Lines changed: 16 additions & 14 deletions
@@ -19,9 +19,11 @@ ${define_required_extensions(DTYPE)}
 
 layout(std430) buffer;
 
-${layout_declare_buffer(B, "w", "nchw_out", DTYPE)}
+${layout_declare_buffer(B, "w", "buf_out", DTYPE)}
 ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
 ${layout_declare_ubo(B, "ivec4", "sizes")}
+$if not TO_STAGING:
+  ${layout_declare_ubo(B, "ivec4", "buf_strides")}
 
 #include "indexing_utils.h"
 
@@ -31,23 +33,23 @@ ${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")}
 const lowp ivec4 axis_map = unhash_axis_map(t_layout);
 const lowp int packed_dim = unhash_packed_dim(t_layout);
 
-void write_out_texel(VEC4_T texel, ivec4 tensor_idx) {
-  const ivec4 buf_indices = tidx_to_nchwi(
-      tensor_idx,
-      sizes,
-      packed_dim);
+void write_out_texel(VEC4_T texel, ivec4 tidx) {
+  $if TO_STAGING:
+    const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim);
+  $else:
+    const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim);
 
-  if (tensor_idx[packed_dim] < sizes[packed_dim]) {
-    nchw_out[buf_indices.x] = BUF_T(texel.x);
+  if (tidx[packed_dim] < sizes[packed_dim]) {
+    buf_out[buf_indices.x] = BUF_T(texel.x);
   }
-  if (tensor_idx[packed_dim] + 1 < sizes[packed_dim]) {
-    nchw_out[buf_indices.y] = BUF_T(texel.y);
+  if (tidx[packed_dim] + 1 < sizes[packed_dim]) {
+    buf_out[buf_indices.y] = BUF_T(texel.y);
   }
-  if (tensor_idx[packed_dim] + 2 < sizes[packed_dim]) {
-    nchw_out[buf_indices.z] = BUF_T(texel.z);
+  if (tidx[packed_dim] + 2 < sizes[packed_dim]) {
+    buf_out[buf_indices.z] = BUF_T(texel.z);
   }
-  if (tensor_idx[packed_dim] + 3 < sizes[packed_dim]) {
-    nchw_out[buf_indices.w] = BUF_T(texel.w);
+  if (tidx[packed_dim] + 3 < sizes[packed_dim]) {
+    buf_out[buf_indices.w] = BUF_T(texel.w);
   }
 }
 

backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml

Lines changed: 6 additions & 4 deletions
@@ -8,14 +8,16 @@ image_to_nchw:
 parameter_names_with_default_values:
   DTYPE: float
   STORAGE: texture3d
+  TO_STAGING: True
 generate_variant_forall:
   DTYPE:
     - VALUE: half
     - VALUE: float
    - VALUE: int
     - VALUE: int8
-  STORAGE:
-    - VALUE: texture3d
-    - VALUE: texture2d
 shader_variants:
-  - NAME: image_to_nchw
+  - NAME: image_to_nchw_texture3d
+  - NAME: image_to_nchw_texture2d
+    STORAGE: texture2d
+  - NAME: clone_image_to_buffer
+    TO_STAGING: False

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 15 additions & 0 deletions
@@ -88,6 +88,21 @@ ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) {
   return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
 }
 
+/*
+ * Get the buffer indices that contain the data of the texel that corresponds
+ * to the provided tensor index. Since a texel has 4 elements, 4 buffer
+ * indices will be retrieved.
+ */
+ivec4 tidx_to_4bufi(
+    const ivec4 tidx,
+    const ivec4 strides,
+    const int packed_dim) {
+  int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z +
+      tidx.w * strides.w;
+
+  return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim];
+}
+
 ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) {
   return ivec4(
       nchwi % sizes.x,
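To make the index arithmetic of `tidx_to_4bufi` concrete, here is a worked example under assumed contiguous, width-packed buffer strides; the values are chosen for illustration and are not taken from the commit:

```cpp
// tidx    = (x=4, y=1, z=1, w=0), packed_dim = 0 (width)
// strides = (1, 8, 32, 64)   // contiguous WHCN strides for sizes (8, 4, 2, 1)
// base_i  = 4*1 + 1*8 + 1*32 + 0*64 = 44
// result  = base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]
//         = (44, 45, 46, 47)  // four consecutive buffer elements for this texel
```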

backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml

Lines changed: 3 additions & 3 deletions
@@ -9,11 +9,11 @@ nchw_to_bitw8_image_nobitw8buffer:
   STORAGE: texture3d
   DTYPE: int8
 generate_variant_forall:
-  DTYPE:
-    - VALUE: int8
-    - VALUE: uint8
   STORAGE:
     - VALUE: texture2d
     - VALUE: texture3d
+  DTYPE:
+    - VALUE: int8
+    - VALUE: uint8
 shader_variants:
   - NAME: nchw_to_bitw8_image_nobitw8buffer

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl

Lines changed: 6 additions & 4 deletions
@@ -22,6 +22,8 @@ layout(std430) buffer;
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_buffer(B, "r", "buf_in", DTYPE)}
 ${layout_declare_ubo(B, "ivec4", "sizes")}
+$if not FROM_STAGING:
+  ${layout_declare_ubo(B, "ivec4", "buf_strides")}
 
 #include "indexing_utils.h"
 
@@ -32,10 +34,10 @@ const lowp ivec4 axis_map = unhash_axis_map(t_layout);
 const lowp int packed_dim = unhash_packed_dim(t_layout);
 
 VEC4_T read_texel(ivec4 tidx) {
-  const ivec4 buf_indices = tidx_to_nchwi(
-      tidx,
-      sizes,
-      packed_dim);
+  $if FROM_STAGING:
+    const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim);
+  $else:
+    const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim);
 
   VEC4_T texel = VEC4_T(0);
   if (tidx[packed_dim] < sizes[packed_dim]) {

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml

Lines changed: 6 additions & 4 deletions
@@ -8,14 +8,16 @@ nchw_to_image:
 parameter_names_with_default_values:
   STORAGE: texture3d
   DTYPE: float
+  FROM_STAGING: True
 generate_variant_forall:
   DTYPE:
     - VALUE: half
     - VALUE: float
     - VALUE: int
     - VALUE: int8
-  STORAGE:
-    - VALUE: texture3d
-    - VALUE: texture2d
 shader_variants:
-  - NAME: nchw_to_image
+  - NAME: nchw_to_image_texture3d
+  - NAME: nchw_to_image_texture2d
+    STORAGE: texture2d
+  - NAME: clone_buffer_to_image
+    FROM_STAGING: False

backends/vulkan/runtime/graph/ops/impl/Clone.cpp

Lines changed: 91 additions & 5 deletions
@@ -10,12 +10,28 @@
 
 #include <executorch/backends/vulkan/runtime/graph/Logging.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/View.h>
+
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
+void resize_clone_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)extra_args;
+  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
+  vTensorPtr in = graph->get_tensor(args[1].refs[0]);
+  // TODO: support for when dimensionality doesn't match, i.e. clone is used to
+  // implement squeeze.
+  if (out->dim() == in->dim()) {
+    out->virtual_resize(in->sizes());
+  }
+}
+
 void add_clone_node(
     ComputeGraph& graph,
     const ValueRef in,
@@ -30,14 +46,84 @@ void add_clone_node(
       VK_KERNEL_FROM_STR(kernel_name),
       graph.create_global_wg_size(out),
       graph.create_local_wg_size(out),
-      {{out, vkapi::MemoryAccessType::WRITE},
-       {in, vkapi::MemoryAccessType::READ}},
-      {t_out->logical_limits_ubo()}));
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
+      // Parameter Buffers
+      {t_out->logical_limits_ubo()},
+      // Specialization Constants
+      {},
+      // Resizing Logic
+      resize_clone_node));
+}
+
+void add_image_to_buffer_node(
+    ComputeGraph& graph,
+    const ValueRef image,
+    const ValueRef buffer) {
+  std::string kernel_name = "clone_image_to_buffer";
+  add_dtype_suffix(kernel_name, graph.dtype_of(image));
+  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
+
+  utils::uvec3 global_wg_size = graph.create_global_wg_size(image);
+  graph.execute_nodes().emplace_back(new DispatchNode(
+      graph,
+      shader,
+      global_wg_size,
+      graph.create_local_wg_size(global_wg_size),
+      // Input and Outputs
+      {{buffer, vkapi::kWrite}, {image, vkapi::kRead}},
+      // Parameter Buffers
+      {graph.sizes_ubo(image), graph.strides_ubo(buffer)},
+      // Specialization Constants
+      {graph.hashed_layout_of(image)},
+      // Resizing Logic
+      resize_clone_node));
+}
+
+void add_buffer_to_image_node(
+    ComputeGraph& graph,
+    const ValueRef buffer,
+    const ValueRef image) {
+  std::string kernel_name = "clone_buffer_to_image";
+  add_dtype_suffix(kernel_name, graph.dtype_of(image));
+  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
+
+  utils::uvec3 global_wg_size = graph.create_global_wg_size(image);
+  graph.execute_nodes().emplace_back(new DispatchNode(
+      graph,
+      shader,
+      global_wg_size,
+      graph.create_local_wg_size(global_wg_size),
+      // Input and Outputs
+      {{image, vkapi::kWrite}, {buffer, vkapi::kRead}},
+      // Parameter Buffers
+      {graph.sizes_ubo(image), graph.strides_ubo(buffer)},
+      // Specialization Constants
+      {graph.hashed_layout_of(image)},
+      // Resizing Logic
+      resize_clone_node));
 }
 
 void clone(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  // The vulkan delegate does not support changing memory format.
-  return add_clone_node(graph, args[0], args[2]);
+  const ValueRef src = args[0];
+  const ValueRef dst = args[2];
+
+  const utils::StorageType src_storage = graph.storage_type_of(src);
+  const utils::StorageType dst_storage = graph.storage_type_of(dst);
+  if (src_storage == utils::kTexture3D && dst_storage == utils::kTexture3D) {
+    if (graph.hashed_layout_of(src) == graph.hashed_layout_of(dst)) {
+      return add_clone_node(graph, src, dst);
+    } else {
+      return add_view_node(graph, src, kDummyValueRef, dst);
+    }
+  }
+  if (src_storage == utils::kTexture3D && dst_storage == utils::kBuffer) {
+    return add_image_to_buffer_node(graph, src, dst);
+  }
+  if (src_storage == utils::kBuffer && dst_storage == utils::kTexture3D) {
+    return add_buffer_to_image_node(graph, src, dst);
+  }
+  VK_THROW("Buffer to buffer memory layout transition not supported yet!");
 }
 
 // Clone node is not the most efficient implementation for the aten.clone
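To make the new transition paths concrete, here is a hedged sketch of requesting a texture-to-buffer transition at graph-build time. It assumes a live `ComputeGraph` named `graph`, calls the helper added above directly (a real caller would normally reach it through the registered `aten.clone` op), and the dtype and layout constants are illustrative:

```cpp
// Illustrative only: copy a texture-backed tensor into a buffer-backed one.
std::vector<int64_t> sizes = {1, 4, 8, 8};

// Source tensor backed by a 3D texture (e.g., produced by a preceding op).
ValueRef image = graph.add_tensor(
    sizes, vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked);

// Destination tensor backed by a buffer with a width-packed layout.
ValueRef buffer = graph.add_tensor(
    sizes, vkapi::kFloat, utils::kBuffer, utils::kWidthPacked);

// Records a clone_image_to_buffer dispatch that copies image -> buffer,
// binding the image's sizes UBO and the buffer's strides UBO as in the
// diff above.
add_image_to_buffer_node(graph, image, buffer);
```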

backends/vulkan/runtime/graph/ops/impl/View.cpp

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/View.h>
+
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
