Update on "[ET-VK] Implement `native_group_norm`"

SS-JIA · SS-JIA · commit 4d40abed4ddc · 2025-06-24T13:44:11.000-07:00
## Changes * Add implementation for the group norm operator. The operator is implemented in two stages. First, a reduction operator is executed to calculate the mean and the reciprocal standard deviation (rstd) of each channel group. Then, the normalization is applied in an elementwise fashion. Differential Revision: [D77038778](https://our.internmc.facebook.com/intern/diff/D77038778/) [ghstack-poisoned]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.glsl
@@ -13,10 +13,6 @@
 
 #define PRECISION ${PRECISION}
 
-#define VEC4_T ${texel_type(DTYPE)}
-
-#define T ${buffer_scalar_type(DTYPE)}
-
 ${define_required_extensions(DTYPE)}
 
 layout(std430) buffer;
@@ -67,8 +63,8 @@ shared float shared_sum_sq[LOCAL_WORK_GROUP_SIZE];
  * N is the number of elements in the tensor buffer; each thread computes one
  * output element.
  *
- * Local work group size:  {1, T, 1}
- * T should be a power of 2, recommended 64 or 128 threads. This allows
+ * Local work group size:  {1, NUM_THREADS, 1}
+ * NUM_THREADS should be a power of 2, recommended 64 or 128 threads. This allows
  * efficient tree-based reduction in shared memory. Each local group will
  * cooperate to compute the output element.
  *
@@ -133,7 +129,7 @@ void group_norm_reduce_C_packed() {
 
     // Check bounds and load texel
     if (all(lessThan(tex_pos, in_limits))) {
-      const VEC4_T texel_val = load_texel(t_in, tex_pos);
+      const vec4 texel_val = load_texel(t_in, tex_pos);
 
       // Process all components of the texel that belong to this group
       const int texel_start_channel = global_texel_idx * 4;
@@ -181,8 +177,8 @@ void group_norm_reduce_C_packed() {
     const float rstd_val = 1.0 / sqrt(variance + epsilon);
 
     // Write to buffer-backed tensors
-    t_mean[global_idx] = T(mean_val);
-    t_rstd[global_idx] = T(rstd_val);
+    t_mean[global_idx] = mean_val;
+    t_rstd[global_idx] = rstd_val;
   }
 }
 
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
@@ -651,12 +651,18 @@ def get_native_group_norm_inputs():
     test_suite = VkTestSuite(
         [
             # (input_shape, weight_shape, bias_shape, N, C, HxW, group, eps)
+            # General test cases
             ((1, 8, 4, 4), (8), (8), 1, 8, 16, 2, 0.001),
             ((2, 8, 3, 3), (8), (8), 2, 8, 9, 4, 0.001),
             ((1, 12, 2, 2), (12), (12), 1, 12, 4, 3, 0.001),
             ((3, 16, 5, 5), (16), (16), 3, 16, 25, 8, 0.001),
+            ((3, 16, 13, 17), (16), (16), 3, 16, 13 * 17, 4, 0.001),
             ((1, 4, 7, 7), (4), (4), 1, 4, 49, 2, 0.001),
             ((2, 6, 1, 8), (6), (6), 2, 6, 8, 3, 0.001),
+            # Single group and prime number sizes
+            ((3, 7, 13, 11), (7), (7), 3, 7, 13 * 11, 1, 0.001),
+            # Each channel is its own group and prime number sizes
+            ((1, 7, 13, 11), (7), (7), 1, 7, 13 * 11, 7, 0.001),
         ]
     )
     test_suite.layouts = [
@@ -667,6 +673,7 @@ def get_native_group_norm_inputs():
     ]
     test_suite.dtypes = [
         "at::kFloat",
+        "at::kHalf",
     ]
     test_suite.arg_storage_types = {
         "out": [None, "utils::kBuffer", "utils::kBuffer"],

Original file line number	Diff line number	Diff line change
`@@ -651,12 +651,18 @@ def get_native_group_norm_inputs():`
`651`	`651`	`test_suite = VkTestSuite(`
`652`	`652`	`[`
`653`	`653`	`# (input_shape, weight_shape, bias_shape, N, C, HxW, group, eps)`
	`654`	`+ # General test cases`
`654`	`655`	`((1, 8, 4, 4), (8), (8), 1, 8, 16, 2, 0.001),`
`655`	`656`	`((2, 8, 3, 3), (8), (8), 2, 8, 9, 4, 0.001),`
`656`	`657`	`((1, 12, 2, 2), (12), (12), 1, 12, 4, 3, 0.001),`
`657`	`658`	`((3, 16, 5, 5), (16), (16), 3, 16, 25, 8, 0.001),`
	`659`	`+ ((3, 16, 13, 17), (16), (16), 3, 16, 13 * 17, 4, 0.001),`
`658`	`660`	`((1, 4, 7, 7), (4), (4), 1, 4, 49, 2, 0.001),`
`659`	`661`	`((2, 6, 1, 8), (6), (6), 2, 6, 8, 3, 0.001),`
	`662`	`+ # Single group and prime number sizes`
	`663`	`+ ((3, 7, 13, 11), (7), (7), 3, 7, 13 * 11, 1, 0.001),`
	`664`	`+ # Each channel is it's own group and prime number sizes`
	`665`	`+ ((1, 7, 13, 11), (7), (7), 1, 7, 13 * 11, 7, 0.001),`
`660`	`666`	`]`
`661`	`667`	`)`
`662`	`668`	`test_suite.layouts = [`
`@@ -667,6 +673,7 @@ def get_native_group_norm_inputs():`
`667`	`673`	`]`
`668`	`674`	`test_suite.dtypes = [`
`669`	`675`	`"at::kFloat",`
	`676`	`+ "at::kHalf",`
`670`	`677`	`]`
`671`	`678`	`test_suite.arg_storage_types = {`
`672`	`679`	`"out": [None, "utils::kBuffer", "utils::kBuffer"],`