Update on "[ET-VK] Minor performance improvements to native layer norm."

trivedivivek · trivedivivek · commit 9cb166b83895 · 2025-04-07T11:13:53.000-07:00
This diff introduces minor performance improvements to the native layer norm function in the Vulkan backend of Executorch. In this new approach, the mean and variance values are calculated in 2 separate passes. The shader is dispatched based on input texture size, and each input texel is read and stored in shared memory. The input stored in shared memory is then summed up using a reduce function. This implementation better utilizes a GPU's parallel processing capabilities. Differential Revision: [D72430290](https://our.internmc.facebook.com/intern/diff/D72430290/) [ghstack-poisoned]
diff --git a/backends/xnnpack/operators/op_slice_copy.py b/backends/xnnpack/operators/op_slice_copy.py
@@ -69,7 +69,9 @@ def define_node(
             output_shape = [output_shape[i] for i in PERM_NCHW_TO_NHWC]
             dim_of_slice = PERM_NHWC_TO_NCHW[dim_of_slice]
 
-        slice_begin_index = cast(int, node.args[2])
+        slice_begin_index = 0
+        if len(node.args) > 2 and node.args[2]:
+            slice_begin_index = cast(int, node.args[2])
         if slice_begin_index < 0:
             slice_begin_index = input_shape[dim_of_slice] + slice_begin_index
 
diff --git a/backends/xnnpack/test/ops/test_slice_copy.py b/backends/xnnpack/test/ops/test_slice_copy.py
@@ -69,6 +69,18 @@ def forward(self, x):
         # Note that two of the slices are optimized away as they are identity.
         self._test_slice_copy(ConvSlice(), inputs, 4, 2)
 
+    def test_fp32_slice_copy_default_start(self):
+        """
+        XNNPACK supports default start in slice op.
+        """
+
+        class Slice(torch.nn.Module):
+            def forward(self, x):
+                return torch.ops.aten.slice.Tensor(x, 0, None, 2)
+
+        inputs = (torch.randn(5, 5),)
+        self._test_slice_copy(Slice(), inputs, 1, 1)
+
     def test_fp32_slice_copy_stride_non_1(self):
         """
         XNNPACK does not support strided slicing.
diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl
@@ -17,6 +17,6 @@ def define_common_targets():
             "@EXECUTORCH_CLIENTS",
         ],
         deps = [
-            "//executorch/runtime/kernel:thread_parallel_interface",
+            "//executorch/extension/threadpool:threadpool",
         ],
     )
diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl
@@ -20,7 +20,7 @@ def define_common_targets():
     ] + (["fb/threadpool_use_n_threads.h"] if not runtime.is_oss else [])
 
     runtime.cxx_library(
-        name = "threadpool",
+        name = "threadpool_lib",
         srcs = _THREADPOOL_SRCS,
         deps = [
             "//executorch/runtime/core:core",
@@ -45,6 +45,38 @@ def define_common_targets():
         ],
     )
 
+    runtime.cxx_library(
+        name = "threadpool",
+        # TODO: OSS doesn't have os:iphoneos. Sync buck2 prelude
+        # update to add it and remove duplication.
+        exported_deps = (select({
+            # Major operating systems should be able to use threadpool.
+            "ovr_config//os:linux": [":threadpool_lib"],
+            "ovr_config//os:macos": [":threadpool_lib"],
+            "ovr_config//os:windows": [":threadpool_lib"],
+            "ovr_config//os:android": [":threadpool_lib"],
+            "ovr_config//os:iphoneos": [":threadpool_lib"],
+            # Machines without an operating system shouldn't.
+            "ovr_config//os:none": ["//executorch/runtime/kernel:thread_parallel_interface"],
+            # If we don't know what it is, disable threadpool out of caution.
+            "DEFAULT": ["//executorch/runtime/kernel:thread_parallel_interface"],
+        }) if not runtime.is_oss else select({
+            # Major operating systems should be able to use threadpool.
+            "ovr_config//os:linux": [":threadpool_lib"],
+            "ovr_config//os:macos": [":threadpool_lib"],
+            "ovr_config//os:windows": [":threadpool_lib"],
+            "ovr_config//os:android": [":threadpool_lib"],
+            # Machines without an operating system shouldn't.
+            "ovr_config//os:none": ["//executorch/runtime/kernel:thread_parallel_interface"],
+            # If we don't know what it is, disable threadpool out of caution.
+            "DEFAULT": ["//executorch/runtime/kernel:thread_parallel_interface"],
+        })),
+        visibility = [
+            "//executorch/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
     runtime.cxx_library(
         name = "cpuinfo_utils",
         srcs = [
diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl
@@ -107,8 +107,8 @@ _OPTIMIZED_ATEN_OPS = (
     op_target(
         name = "op_where",
         deps = [
+            "//executorch/extension/threadpool:threadpool",
             "//executorch/kernels/portable/cpu/util:elementwise_util",
-            "//executorch/runtime/kernel:thread_parallel_interface",
         ],
     ),
 )
diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl
@@ -232,9 +232,9 @@ def define_libs(is_fbcode=False):
                 "DEFAULT": [],
             }) + LIBBLAS_DEPS,
             exported_deps = [
+                "//executorch/extension/threadpool:threadpool",
                 "//executorch/kernels/optimized:libutils",
                 "//executorch/runtime/core/exec_aten:lib",
-                "//executorch/runtime/kernel:thread_parallel_interface",
             ],
             **get_apple_framework_deps_kwargs(is_fbcode),
         )
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
@@ -12,6 +12,7 @@ def define_common_targets():
     runtime.cxx_library(
         name = "all_deps",
         deps = [
+            "//executorch/extension/threadpool:threadpool",
             "//executorch/kernels/portable/cpu/util:functional_util",
             "//executorch/kernels/portable/cpu/util:broadcast_util",
             "//executorch/kernels/portable/cpu/util:kernel_ops_util",
@@ -32,7 +33,6 @@ def define_common_targets():
             "//executorch/kernels/portable/cpu/util:slice_util",
             "//executorch/kernels/portable/cpu/util:elementwise_util",
             "//executorch/kernels/portable/cpu/util:upsample_util",
-            "//executorch/runtime/kernel:thread_parallel_interface",
         ],
         visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"],
     )
@@ -111,7 +111,7 @@ def define_common_targets():
             ":broadcast_util",
             ":dtype_util",
             "//executorch/runtime/kernel:kernel_runtime_context",
-            "//executorch/runtime/kernel:thread_parallel_interface",
+            "//executorch/extension/threadpool:threadpool",
         ],
         deps = [
             "//executorch/kernels/portable/cpu:scalar_utils",
@@ -245,7 +245,7 @@ def define_common_targets():
         srcs = [],
         exported_headers = ["functional_util.h"],
         exported_deps = [
-            "//executorch/runtime/kernel:thread_parallel_interface",
+            "//executorch/extension/threadpool:threadpool",
         ],
         deps = [
             "//executorch/runtime/kernel:kernel_includes",
@@ -319,7 +319,7 @@ def define_common_targets():
                 "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix),
             ],
             exported_deps = [
-                "//executorch/runtime/kernel:thread_parallel_interface",
+                "//executorch/extension/threadpool:threadpool",
             ],
             exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [],
             visibility = [
diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl
@@ -59,9 +59,9 @@ def define_common_targets():
             "//executorch/runtime/core/portable_type/c10/c10:c10",
             "//executorch/runtime/platform:platform",
         ],
+        # Don't depend on this target, depend on //executorch/extension/threadpool:threadpool.
         visibility = [
-            "//executorch/...",
-            "@EXECUTORCH_CLIENTS",
+            "//executorch/extension/threadpool/...",
         ],
     )
 

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,6 @@ def define_common_targets():`
`17`	`17`	`"@EXECUTORCH_CLIENTS",`
`18`	`18`	`],`
`19`	`19`	`deps = [`
`20`		`- "//executorch/runtime/kernel:thread_parallel_interface",`
	`20`	`+ "//executorch/extension/threadpool:threadpool",`
`21`	`21`	`],`
`22`	`22`	`)`