Integrate axis mapping into binary op (#5408)

nathanaelsee · facebook-github-bot · commit 3e2cfc7b34d4 · 2024-09-17T12:32:59.000-07:00
Summary: Pull Request resolved: #5408. Update binary op to support axis-mapped textures. Reviewed By: SS-JIA. Differential Revision: D62622013. fbshipit-source-id: 070ce40e22f4fca4d438d8dd3a33887bee8fc78a
diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl
@@ -19,38 +19,45 @@
 
 layout(std430) buffer;
 
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
-${layout_declare_tensor(2, "r", "t_other", DTYPE, STORAGE)}
-${layout_declare_ubo(3, "ivec4", "out_sizes")}
-${layout_declare_ubo(4, "ivec4", "in_sizes")}
-${layout_declare_ubo(5, "ivec4", "other_sizes")}
-${layout_declare_ubo(6, "ivec2", "broadcast_params")}
-${layout_declare_ubo(7, "float", "alpha")}
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)}
+${layout_declare_ubo(B, "ivec4", "out_sizes")}
+${layout_declare_ubo(B, "ivec4", "out_axis_map")}
+${layout_declare_ubo(B, "ivec4", "in_sizes")}
+${layout_declare_ubo(B, "ivec4", "in_axis_map")}
+${layout_declare_ubo(B, "ivec4", "other_sizes")}
+${layout_declare_ubo(B, "ivec4", "other_axis_map")}
+${layout_declare_ubo(B, "ivec2", "broadcast_params")}
+${layout_declare_ubo(B, "float", "alpha")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 layout(constant_id = 3) const int packed_dim = C_DIM;
 
 void main() {
+  // pos is physical (x, y, z), as global workgroup uses image extents
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim);
+  // physical pos (x, y, z) -> logical (w, c, h, n) output
+  const ivec4 idx = to_tensor_idx(pos, out_sizes, out_axis_map, packed_dim);
 
   if (any(greaterThanEqual(idx, out_sizes))) {
     return;
   }
 
+  // broadcast on logical sizes
   ivec4 in_idx = broadcast_indices(idx, in_sizes);
-  VEC4_T in_texel = VEC4_T(texelFetch(
+  VEC4_T in_texel = VEC4_T(load_texel(
     t_in,
-    to_texture_pos(in_idx, in_sizes, packed_dim),
-    0));
+    // read axis mapped texel
+    to_texture_pos(in_idx, in_sizes, in_axis_map, packed_dim)));
 
+  // broadcast on logical sizes
   ivec4 other_idx = broadcast_indices(idx, other_sizes);
-  VEC4_T other_texel = VEC4_T(texelFetch(
+  VEC4_T other_texel = VEC4_T(load_texel(
     t_other,
-    to_texture_pos(other_idx, other_sizes, packed_dim),
-    0));
+    // read axis mapped texel
+    to_texture_pos(other_idx, other_sizes, other_axis_map, packed_dim)));
 
   // Check boolean broadcast flags; we use ivec2 instead of bvec2 for alignment.
   if (broadcast_params.x > 0) {
@@ -60,5 +67,7 @@ void main() {
     other_texel = other_texel.xxxx;
   }
 
-  imageStore(t_out, pos, VEC4_T(op(in_texel, other_texel, alpha)));
+  imageStore(t_out,
+    to_texture_pos(idx, out_sizes, out_axis_map, packed_dim),
+    VEC4_T(op(in_texel, other_texel, alpha)));
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
@@ -85,8 +85,11 @@ void add_binary_op_node(
        {{arg1, arg2}, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
       {t_out->sizes_ubo(),
+       t_out->axis_map_ubo(),
        t_in1->sizes_ubo(),
+       t_in1->axis_map_ubo(),
        t_in2->sizes_ubo(),
+       t_in2->axis_map_ubo(),
        graph.create_params_buffer(broadcast_params),
        graph.create_params_buffer(alpha_val)},
       // Specialization Constants
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
@@ -49,6 +49,7 @@ def get_binary_elementwise_inputs():
             ((S, S1, S2), (S, S1, S2)),
             ((S, S1, S2), (S, S1, 1), 2.0),
             ((S, S1, S2), (S, 1, S2), 2.0),
+            ((XS, S, S1, S2), (XS, S, 1, 1), 2.0),
         ]
     )
     test_suite.layouts = [
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
@@ -204,6 +204,16 @@ def forward(self, x, y, w):
 
         self.lower_module_and_test_output(add_module, sample_inputs)
 
+        sample_inputs = (
+            torch.rand(size=(4, 5, 2, 3), dtype=torch.float32),
+            torch.rand(size=(4, 5, 2, 3), dtype=torch.float32),
+            torch.rand(
+                size=(2, 3), dtype=torch.float32
+            ),  # test broadcasting on packed dim
+        )
+
+        self.lower_module_and_test_output(add_module, sample_inputs)
+
     def test_vulkan_backend_add_int(self):
         class AddIntModule(torch.nn.Module):
             def __init__(self):
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1399,6 +1399,7 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) {
 TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   GraphConfig config;
   ComputeGraph graph(config);
+  size_t expected_vma_allocation_count = 0;
 
   std::vector<int64_t> size_big = {12, 64, 64};
   std::vector<int64_t> size_small = {12, 64, 64};
@@ -1417,7 +1418,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   // +2: t.sizes_ubo() for each staging shader
   // +2: t.axis_map_ubo() for each staging shader
   // +2: staging buffer for each input tensor
-  EXPECT_TRUE(get_vma_allocation_count() == 6);
+  expected_vma_allocation_count += 6;
+  EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   ValueRef c = graph.add_tensor(
       size_big,
@@ -1427,16 +1429,22 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   auto addFn = VK_GET_OP_FN("aten.add.Tensor");
   addFn(graph, {a.value, b.value, kDummyValueRef, c});
 
+  // +2: alpha UBO, broadcast UBO for arithmetic shader
+  // +1: t.sizes_ubo() for arithmetic shader output c
+  // +1: t.axis_map_ubo() for arithmetic shader output c
+  expected_vma_allocation_count += 4;
+  EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
+
   IOValueRef d = graph.add_input_tensor(
       size_small,
       vkapi::kFloat,
       /*shared_object_idx = */ 2);
 
-  // +2: alpha UBO, broadcast UBO for arithmetic shader
   // +1: t.sizes_ubo() uniform buffer for staging shader
   // +1: t.axis_map_ubo() uniform buffer for staging shader
   // +1: staging buffer for the input tensor
-  EXPECT_TRUE(get_vma_allocation_count() == 12);
+  expected_vma_allocation_count += 3;
+  EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   ValueRef e = graph.add_tensor(
       size_big,
@@ -1446,21 +1454,26 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   auto mulFn = VK_GET_OP_FN("aten.mul.Tensor");
   mulFn(graph, {c, d.value, e});
 
+  // +2: alpha UBO, broadcast UBO for arithmetic shader
+  // +1: t.sizes_ubo() for arithmetic shader output e
+  // +1: t.axis_map_ubo() for arithmetic shader output e
+  expected_vma_allocation_count += 4;
+  EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
+
   IOValueRef out = {};
   out.value = e;
   out.staging = graph.set_output_tensor(out.value);
 
-  // +2: alpha UBO, broadcast UBO for arithmetic shader
-  // +1: t.sizes_ubo() for staging shader
-  // +1: t.axis_map_ubo() for staging shader
-  // +1 staging buffer for the input tensor
-  EXPECT_TRUE(get_vma_allocation_count() == 17);
+  // +1: staging buffer for the output tensor
+  expected_vma_allocation_count += 1;
+  EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   graph.prepare();
   graph.encode_execute();
 
   // +3: shared memory allocations for tensors
-  EXPECT_TRUE(get_vma_allocation_count() == 20);
+  expected_vma_allocation_count += 3;
+  EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   // Run graph
 

Original file line number	Diff line number	Diff line change
`@@ -49,6 +49,7 @@ def get_binary_elementwise_inputs():`
`49`	`49`	`((S, S1, S2), (S, S1, S2)),`
`50`	`50`	`((S, S1, S2), (S, S1, 1), 2.0),`
`51`	`51`	`((S, S1, S2), (S, 1, S2), 2.0),`
	`52`	`+ ((XS, S, S1, S2), (XS, S, 1, 1), 2.0),`
`52`	`53`	`]`
`53`	`54`	`)`
`54`	`55`	`test_suite.layouts = [`