refactor mm and linear implementation (#4011)

copyrightly · facebook-github-bot · commit f2b0595b30ff · 2024-06-21T00:05:20.000-07:00
Summary: Pull Request resolved: #4011 Our existing `mm` uses one generalized shader for both 2d and 3d inputs. nathanaelsee found that edge graphs convert some 3d inputs into 2d, so the op actually needed becomes `mm` instead of `bmm`. To optimize performance, we split the current implementation into separate 2d and 3d (batched) shaders, generated as variants of the same template. Reviewed By: nathanaelsee Differential Revision: D58629158 fbshipit-source-id: 4300cf086424618aecf0637da59bba46db05e42f
diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl
@@ -13,6 +13,9 @@
 $if MAT2_IS_TRANSPOSED:
   #define MAT2_IS_TRANSPOSED
 
+$if BATCH_MODE:
+  #define BATCH_MODE
+
 #include "indexing_utils.h"
 #include "matmul.h"
 
@@ -52,12 +55,20 @@ void main() {
     return;
   }
 
-  FloatMatrix results = matmul_partial_4x4(
+  $if BATCH_MODE:
+    FloatMatrix_3d results = matmul_partial_4x4x4(
       im_mat1,
       im_mat2,
       pos,
       out_sizes[2],
       in_limits[0]);
+  $else:
+    FloatMatrix_2d results = matmul_partial_4x4(
+        im_mat1,
+        im_mat2,
+        pos,
+        out_sizes[2],
+        in_limits[0]);
 
   for (int idx_c = 0; idx_c < FOUR; idx_c++) {
     for (int idx_r = 0; idx_r < FOUR; idx_r++) {
@@ -71,14 +82,21 @@ void main() {
           self_sizes.y == 1);
 
       // results is in transposed order w.r.t. the desired output
-      imageStore(
+      $if BATCH_MODE:
+        imageStore(
           im_out,
           out_pos,
           vec4(
               beta * self_texel.x + alpha * results.data[idx_c][idx_r][0],
               beta * self_texel.x + alpha * results.data[idx_c][idx_r][1],
               beta * self_texel.x + alpha * results.data[idx_c][idx_r][2],
               beta * self_texel.x + alpha * results.data[idx_c][idx_r][3]));
+      $else:
+        imageStore(
+            im_out,
+            out_pos,
+            vec4(
+                beta * self_texel.x + alpha * results.data[idx_c][idx_r], 0.0, 0.0, 0.0));
     }
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml
@@ -10,6 +10,7 @@ addmm_optimized:
     NDIM: 3
     PACKING: C_packed
     MAT2_IS_TRANSPOSED: false
+    BATCH_MODE: false
   generate_variant_forall:
     DTYPE:
       - VALUE: float
@@ -18,3 +19,8 @@ addmm_optimized:
     - NAME: addmm_optimized
     - NAME: linear_optimized
       MAT2_IS_TRANSPOSED: true
+    - NAME: batch_addmm_optimized
+      BATCH_MODE: true
+    - NAME: batch_linear_optimized
+      MAT2_IS_TRANSPOSED: true
+      BATCH_MODE: true
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.h b/backends/vulkan/runtime/graph/ops/glsl/matmul.h
@@ -12,7 +12,11 @@
 
 // we avoid mat4 and vec4 usage here as they compile to much less efficient
 // SPIR-V
-struct FloatMatrix {
+struct FloatMatrix_2d {  // result type of matmul_partial_4x4 (non-batched path)
+  float data[FOUR][FOUR];  // one scalar accumulator per (row, col) of a FOUR x FOUR tile
+};
+
+struct FloatMatrix_3d {  // result type of matmul_partial_4x4x4 (BATCH_MODE path)
   float data[FOUR][FOUR][FOUR];
 };
 
@@ -146,13 +150,56 @@ vec4 get_texel_C_packed(
   return self_texel;
 }
 
-FloatMatrix matmul_partial_4x4(
+FloatMatrix_2d matmul_partial_4x4(  // non-batched variant: accumulates one FOUR x FOUR tile of mat1 x mat2, reading only z == 0
+    sampler3D im_mat1,
+    sampler3D im_mat2,
+    const ivec3 pos,  // pos.y selects the group of FOUR mat1 rows, pos.x the group of FOUR mat2 columns; pos.z is not read here
+    const int batch_size,  // not read in this 2D variant
+    const int K_texel_len) {  // loop bound: number of texels walked along the x axis of im_mat1
+  FloatMatrix_2d results;
+  for (int i = 0; i < FOUR; i++) {  // zero every accumulator before summing partial products
+    for (int j = 0; j < FOUR; j++) {
+      results.data[i][j] = 0.0f;
+    }
+  }
+  vec4 im_mat1_partial_load[FOUR];  // one texel per mat1 row of the current tile
+  vec4 im_mat2_partial_load[FOUR];  // one texel per mat2 column of the current tile
+
+  for (int mat1_x = 0; mat1_x < K_texel_len; mat1_x++) {  // each iteration contributes one texel-wide slice of the dot products
+    for (int offset = 0; offset < FOUR; offset++) {
+      // read and cache one texel for each of the FOUR rows of the im_mat1 tile
+      const int mat1_y = (FOUR * pos.y) + offset;
+      const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, 0);  // z fixed at 0: no batch index in this variant
+      im_mat1_partial_load[offset] = texelFetch(im_mat1, mat1_pos, 0);
+      // read and cache the matching FOUR texels of im_mat2; addressing depends on MAT2_IS_TRANSPOSED
+#ifdef MAT2_IS_TRANSPOSED
+      const int mat2_y = (FOUR * pos.x) + offset;
+      const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0);  // transposed: walk mat2 along x in step with mat1
+      im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0);
+#else
+      const int mat2_x = (FOUR * pos.x) + offset;
+      const ivec3 mat2_pos = ivec3(mat2_x, mat1_x, 0);  // not transposed: mat1_x is used as the y coordinate of mat2
+      im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0);
+#endif
+    }
+    // perform partial dot products and add each partial result to its accumulator in results
+    for (int out_row = 0; out_row < FOUR; out_row++) {
+      for (int out_col = 0; out_col < FOUR; out_col++) {
+        results.data[out_row][out_col] +=
+            dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]);
+      }
+    }
+  }
+  return results;  // indexed [mat1 row offset][mat2 column offset] within the tile
+}
+
+FloatMatrix_3d matmul_partial_4x4x4(
     sampler3D im_mat1,
     sampler3D im_mat2,
     const ivec3 pos,
     const int batch_size,
     const int K_texel_len) {
-  FloatMatrix results;
+  FloatMatrix_3d results;
   for (int i = 0; i < FOUR; i++) {
     for (int j = 0; j < FOUR; j++) {
       for (int k = 0; k < FOUR; k++) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl
@@ -13,6 +13,9 @@
 $if MAT2_IS_TRANSPOSED:
   #define MAT2_IS_TRANSPOSED
 
+$if BATCH_MODE:
+  #define BATCH_MODE
+
 #include "indexing_utils.h"
 #include "matmul.h"
 
@@ -41,27 +44,41 @@ void main() {
     return;
   }
 
-  FloatMatrix results = matmul_partial_4x4(
-      im_mat1,
-      im_mat2,
-      pos,
-      out_sizes[2],
-      in_limits[0]);
+  $if BATCH_MODE:
+    FloatMatrix_3d results = matmul_partial_4x4x4(
+        im_mat1,
+        im_mat2,
+        pos,
+        out_sizes[2],
+        in_limits[0]);
+  $else:
+    FloatMatrix_2d results = matmul_partial_4x4(
+        im_mat1,
+        im_mat2,
+        pos,
+        out_sizes[2],
+        in_limits[0]);
 
   for (int idx_c = 0; idx_c < FOUR; idx_c++) {
     for (int idx_r = 0; idx_r < FOUR; idx_r++) {
       const ivec3 out_pos =
           ivec3(idx_r + FOUR * pos.x, idx_c + FOUR * pos.y, pos.z);
 
       // results is in transposed order w.r.t. the desired output
-      imageStore(
+      $if BATCH_MODE:
+        imageStore(
           im_out,
           out_pos,
           vec4(
               results.data[idx_c][idx_r][0],
               results.data[idx_c][idx_r][1],
               results.data[idx_c][idx_r][2],
               results.data[idx_c][idx_r][3]));
+      $else:
+        imageStore(
+            im_out,
+            out_pos,
+            vec4(results.data[idx_c][idx_r], 0.0, 0.0, 0.0));
     }
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml
@@ -10,6 +10,7 @@ matmul_optimized:
     NDIM: 3
     PACKING: C_packed
     MAT2_IS_TRANSPOSED: false
+    BATCH_MODE: false
   generate_variant_forall:
     DTYPE:
       - VALUE: float
@@ -18,3 +19,8 @@ matmul_optimized:
     - NAME: matmul_optimized
     - NAME: matmul_transposed_optimized
       MAT2_IS_TRANSPOSED: true
+    - NAME: batch_matmul_optimized
+      BATCH_MODE: true
+    - NAME: batch_matmul_transposed_optimized
+      MAT2_IS_TRANSPOSED: true
+      BATCH_MODE: true
diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp
@@ -169,6 +169,12 @@ void add_addmm_optimized_node(
   std::string kernel_name = graph.get_bool(mat2_is_transposed)
       ? "linear_optimized"
       : "addmm_optimized";
+
+  int mat1_dims = graph.sizes_of(mat1_W_packed).size();
+  if (mat1_dims == 3) {
+    kernel_name = "batch_" + kernel_name;
+  }
+
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
@@ -134,6 +134,12 @@ void add_matmul_optimized_node(
   std::string kernel_name = mat2_is_transposed_val
       ? "matmul_transposed_optimized"
       : "matmul_optimized";
+
+  int mat1_dims = graph.sizes_of(mat1_W_packed).size();
+  if (mat1_dims == 3) {
+    kernel_name = "batch_" + kernel_name;
+  }
+
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   graph.execute_nodes().emplace_back(new ExecuteNode(