Commit 926a760
Update base for Update on "[ET-VK] Allow specifying multiple storage types/memory layouts for an operator + register group norm operator"
## Changes

* Handle cases where an operator needs to specify a separate storage type / memory layout for each individual output.

## Motivation

Required for the group norm operator.

## Future Work

Currently, the `tag_memory_meta_pass` graph pass assumes that all tensors participating in a computation (aside from weights) will have the same storage type and memory layout. As more operators are added, there are more exceptions to this rule. The pass may need an update in the near future to make it possible to specify required storage types and memory layouts at a more granular level.

Differential Revision: [D77038781](https://our.internmc.facebook.com/intern/diff/D77038781/)

[ghstack-poisoned]
1 parent a9958a3 commit 926a760
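To make the per-output idea in the Changes section concrete, here is a minimal, purely illustrative Python sketch. None of these class or enum names are ExecuTorch/ET-VK API; only the idea of mapping each group norm output to its own storage type and memory layout comes from this commit (see the `utils::kBuffer` entries in the test diff below and the buffer writes in the reduce shader).

```python
# Illustrative sketch only -- not ExecuTorch/ET-VK API. It models the idea of
# giving each output of an operator its own storage type and memory layout.
from dataclasses import dataclass
from enum import Enum, auto


class StorageType(Enum):
    TEXTURE_3D = auto()  # image/texture-backed tensor
    BUFFER = auto()      # buffer-backed tensor


class MemoryLayout(Enum):
    CHANNELS_PACKED = auto()
    WIDTH_PACKED = auto()


@dataclass
class OutputSpec:
    storage: StorageType
    layout: MemoryLayout


# native_group_norm produces three outputs: out, mean, rstd. In this commit
# the main output keeps its default storage, while mean and rstd are
# buffer-backed. The specific layouts chosen here are assumptions for the
# sake of the example.
group_norm_output_specs = {
    "out": OutputSpec(StorageType.TEXTURE_3D, MemoryLayout.CHANNELS_PACKED),
    "mean": OutputSpec(StorageType.BUFFER, MemoryLayout.WIDTH_PACKED),
    "rstd": OutputSpec(StorageType.BUFFER, MemoryLayout.WIDTH_PACKED),
}
```

A per-output mapping like this is the granularity the Future Work section says `tag_memory_meta_pass` does not yet model for all participating tensors.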

File tree

2 files changed: +12, -9 lines

backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.glsl

Lines changed: 5 additions & 9 deletions
```diff
@@ -13,10 +13,6 @@
 
 #define PRECISION ${PRECISION}
 
-#define VEC4_T ${texel_type(DTYPE)}
-
-#define T ${buffer_scalar_type(DTYPE)}
-
 ${define_required_extensions(DTYPE)}
 
 layout(std430) buffer;
@@ -67,8 +63,8 @@ shared float shared_sum_sq[LOCAL_WORK_GROUP_SIZE];
  * N is the number of elements in the tensor buffer; each thread computes one
  * output element.
  *
- * Local work group size: {1, T, 1}
- * T should be a power of 2, recommended 64 or 128 threads. This allows
+ * Local work group size: {1, float, 1}
+ * float should be a power of 2, recommended 64 or 128 threads. This allows
  * efficient tree-based reduction in shared memory. Each local group will
  * cooperate to compute the output element.
  *
@@ -133,7 +129,7 @@ void group_norm_reduce_C_packed() {
 
   // Check bounds and load texel
   if (all(lessThan(tex_pos, in_limits))) {
-    const VEC4_T texel_val = load_texel(t_in, tex_pos);
+    const vec4 texel_val = load_texel(t_in, tex_pos);
 
     // Process all components of the texel that belong to this group
     const int texel_start_channel = global_texel_idx * 4;
@@ -181,8 +177,8 @@ void group_norm_reduce_C_packed() {
     const float rstd_val = 1.0 / sqrt(variance + epsilon);
 
     // Write to buffer-backed tensors
-    t_mean[global_idx] = T(mean_val);
-    t_rstd[global_idx] = T(rstd_val);
+    t_mean[global_idx] = mean_val;
+    t_rstd[global_idx] = rstd_val;
   }
 }
 
```
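The shader's doc comment describes a shared-memory, tree-based reduction over a power-of-two local work group. Below is a small Python sketch of that halving-stride pattern, purely to illustrate the access pattern; it is not the GLSL code, and the "threads" are simulated by a loop.

```python
# Illustrative sketch of the tree-based reduction named in the shader comment:
# a power-of-two work group cooperatively sums partial results, halving the
# number of active "threads" each step. In GLSL each step would be separated
# by a barrier(); here the threads are just iterations of an inner loop.
def tree_reduce(partial_sums):
    vals = list(partial_sums)
    assert len(vals) & (len(vals) - 1) == 0, "work group size must be a power of 2"
    stride = len(vals) // 2
    while stride > 0:
        for i in range(stride):      # thread i accumulates thread i + stride
            vals[i] += vals[i + stride]
        stride //= 2                 # half as many active threads next step
    return vals[0]


# e.g. 64 partial sums (one per thread) reduced to a single group result
print(tree_reduce([1.0] * 64))  # 64.0
```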
backends/vulkan/test/op_tests/cases.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -651,12 +651,18 @@ def get_native_group_norm_inputs():
     test_suite = VkTestSuite(
         [
             # (input_shape, weight_shape, bias_shape, N, C, HxW, group, eps)
+            # General test cases
             ((1, 8, 4, 4), (8), (8), 1, 8, 16, 2, 0.001),
             ((2, 8, 3, 3), (8), (8), 2, 8, 9, 4, 0.001),
             ((1, 12, 2, 2), (12), (12), 1, 12, 4, 3, 0.001),
             ((3, 16, 5, 5), (16), (16), 3, 16, 25, 8, 0.001),
+            ((3, 16, 13, 17), (16), (16), 3, 16, 13 * 17, 4, 0.001),
             ((1, 4, 7, 7), (4), (4), 1, 4, 49, 2, 0.001),
             ((2, 6, 1, 8), (6), (6), 2, 6, 8, 3, 0.001),
+            # Single group and prime number sizes
+            ((3, 7, 13, 11), (7), (7), 3, 7, 13 * 11, 1, 0.001),
+            # Each channel is its own group and prime number sizes
+            ((1, 7, 13, 11), (7), (7), 1, 7, 13 * 11, 7, 0.001),
         ]
     )
     test_suite.layouts = [
@@ -667,6 +673,7 @@ def get_native_group_norm_inputs():
     ]
     test_suite.dtypes = [
         "at::kFloat",
+        "at::kHalf",
     ]
     test_suite.arg_storage_types = {
         "out": [None, "utils::kBuffer", "utils::kBuffer"],
```

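For reference, each tuple in the test list above follows the argument order of PyTorch's `aten::native_group_norm`, so a single case can be sanity-checked on CPU with public PyTorch ops as in the sketch below. The comparison against the Vulkan backend itself is outside this sketch.

```python
# Sketch: run one test-case tuple from cases.py through PyTorch's reference
# native_group_norm to see the three outputs the ET-VK shaders must produce.
import torch

# (input_shape, weight_shape, bias_shape, N, C, HxW, group, eps)
input_shape, N, C, HxW, group, eps = (1, 8, 4, 4), 1, 8, 16, 2, 0.001

x = torch.randn(input_shape)
weight = torch.randn(C)
bias = torch.randn(C)

out, mean, rstd = torch.ops.aten.native_group_norm(
    x, weight, bias, N, C, HxW, group, eps
)

# mean and rstd hold one value per (batch, group) pair; these correspond to
# the buffer-backed t_mean / t_rstd tensors written by the reduce shader.
print(out.shape, mean.shape, rstd.shape)
# torch.Size([1, 8, 4, 4]) torch.Size([1, 2]) torch.Size([1, 2])
```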