Update base for Update on "[ET-VK] New implementation of cat operator"

SS-JIA · SS-JIA · commit 2c6915e4831e · 2025-06-12T11:40:39.000-07:00
## Changes * Introduce `concat_texture.glsl` and `concat_buffer.glsl` to implement the `torch.cat` operator * Introduce `Concat.cpp` to replace `Cat.cpp` * Fix a bug with channels-packed buffer tensors where input data would be copied incorrectly with multiple dims have a stride of 1 ## Motivation > * Introduce `concat_texture.glsl` and `concat_buffer.glsl` to implement the `torch.cat` operator > * Introduce `Concat.cpp` to replace `Cat.cpp` The existing implementation of `torch.cat` uses the copy_channel_offset` shaders. However, these shaders have a critical bug where the output tensor is passed in separately with difference access types, i.e. ``` graph.execute_nodes().emplace_back(new DispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), global_size, local_size, // Inputs and Outputs { {out, vkapi::kWrite}, {out, vkapi::kRead}, {in, vkapi::kRead}, }, ``` This creates many validation layer errors because the memory barriers for the resource cannot be formed properly. The shader essentially relies on undefined behaviour to work correctly. The result is that the `cat` operator produces incorrect result on many platforms. Rather than fix the `copy_offset` shaders, I decided to just introduce new shaders to perform the concat operation. The new implementation handles both buffer and texture inputs and is agnostic to memory layout. Differential Revision: [D76305343](https://our.internmc.facebook.com/intern/diff/D76305343/) [ghstack-poisoned]
diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -16,8 +16,6 @@
 
 #include <executorch/backends/vulkan/runtime/utils/StorageUtils.h>
 
-#include <iostream>
-
 namespace vkcompute {
 namespace api {
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
@@ -253,13 +253,13 @@ ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) {
  * e.g. 0x11021, 1 -> ivec4(1, 2, 0, 1)
  */
 #define unhash_axis_map(hash) \
-  ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))
+  (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf)))
 
 /*
  *
  */
 #define unhash_dim_order(hash) \
-  ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))
+  (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf)))
 
 #define unhash_packed_dim(hash) int(hash >> 16 & 0xf)
 
diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp
@@ -32,8 +32,8 @@ BufferBindInfo::BufferBindInfo(
 
 BufferBindInfo::BufferBindInfo(
     const VulkanBuffer& buffer_p,
-    const uint32_t offset_p,
-    const uint32_t range_p)
+    const size_t offset_p,
+    const size_t range_p)
     : handle(buffer_p.handle()),
       offset(buffer_p.mem_offset() + offset_p),
       range(range_p) {
diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h
@@ -36,8 +36,8 @@ struct BufferBindInfo final {
   BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u);
   BufferBindInfo(
       const VulkanBuffer& buffer_p,
-      const uint32_t offset_p,
-      const uint32_t range_p);
+      const size_t offset_p,
+      const size_t range_p);
 };
 
 struct ParamsBindList final {