From edccca4935f9bc3d090ec134614aba8e2aa629c2 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 30 Oct 2024 15:31:01 -0700
Subject: [PATCH 1/3] [Executorch][Kernels] Build optimized lib with -O2

Pull Request resolved: https://github.com/pytorch/executorch/pull/6517

Don't rely on the top-level config for optimization; instead, set our own optimization flags for the optimized kernels.
ghstack-source-id: 251021623
@exported-using-ghexport

Differential Revision: [D64910575](https://our.internmc.facebook.com/intern/diff/D64910575/)
---
 kernels/optimized/lib_defs.bzl                 |  5 +++++
 kernels/optimized/op_registration_util.bzl     |  6 +++++-
 .../kernels/portable/op_registration_util.bzl  | 18 ++++++++++++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl
index d9721e5055d..c3799f7db51 100644
--- a/kernels/optimized/lib_defs.bzl
+++ b/kernels/optimized/lib_defs.bzl
@@ -2,6 +2,10 @@ load("@fbsource//tools/build_defs:default_platform_defs.bzl", "DEVSERVER_PLATFOR
 load("@fbsource//tools/build_defs:fb_native_wrapper.bzl", "fb_native")
 load("@fbsource//xplat/executorch/backends/xnnpack/third-party:third_party_libs.bzl", "third_party_dep")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load(
+    "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
+    "get_compiler_optimization_flags",
+)
 
 # Because vec exists as a collection of header files, compile and preprocessor
 # flags applied to the vec target do not have any effect, since no compilation
@@ -121,6 +125,7 @@ def define_libs():
             exported_headers = native.glob([
                 "blas/**/*.h",
             ]),
+            compiler_flags = get_compiler_optimization_flags(),
             header_namespace = "executorch/kernels/optimized",
             visibility = [
                 "//executorch/...",
diff --git a/kernels/optimized/op_registration_util.bzl b/kernels/optimized/op_registration_util.bzl
index c969aa81a9a..6e74836bb79 100644
--- a/kernels/optimized/op_registration_util.bzl
+++ b/kernels/optimized/op_registration_util.bzl
@@ -4,6 +4,10 @@ load(
     "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
     "get_vec_android_preprocessor_flags",
 )
+load(
+    "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
+    "get_compiler_optimization_flags",
+)
 
 def op_target(name, deps = []):
     """Registers an optimized implementation for an operator overload group.
@@ -87,7 +91,7 @@ def define_op_library(name, deps):
         ],
         # kernels often have helpers with no prototypes just disabling the warning here as the headers
         # are codegend and linked in later
-        compiler_flags = ["-Wno-missing-prototypes"],
+        compiler_flags = ["-Wno-missing-prototypes"] + get_compiler_optimization_flags(),
         deps = [
             "//executorch/runtime/kernel:kernel_includes",
         ] + augmented_deps,
diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl
index ef170d62970..c5349b9475b 100644
--- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl
+++ b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl
@@ -1,6 +1,24 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "is_xplat", "runtime")
 load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
 
+def get_compiler_optimization_flags():
+    """Return extra compiler flags: [] in OSS, else a select() adding -O2 on android-arm64, iphoneos and macos-arm64."""
+    # various ovr_configs are not available in oss, so bail out before building the select()
+    if runtime.is_oss:
+        return []
+    return select({
+        "DEFAULT": [],
+        "ovr_config//os:android-arm64": [
+            "-O2",
+        ],
+        "ovr_config//os:iphoneos": [
+            "-O2",
+        ],
+        "ovr_config//os:macos-arm64": [
+            "-O2",
+        ],
+    })
+
 def op_target(name, deps = [], android_deps = [], _allow_third_party_deps = False, _aten_mode_deps = []):
     """Registers an implementation of an operator overload group.
 

From 4348b429e82ff4ffecfa29da9184430d7aaabf31 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 30 Oct 2024 15:31:02 -0700
Subject: [PATCH 2/3] [Executorch][llm] Compile custom op with -O2

Pull Request resolved: https://github.com/pytorch/executorch/pull/6518

This enables some optimizations for inlining vectorized lib functions
ghstack-source-id: 251021622
@exported-using-ghexport

Differential Revision: [D64910576](https://our.internmc.facebook.com/intern/diff/D64910576/)
---
 extension/llm/custom_ops/targets.bzl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl
index c2843f5c2f7..6b9f9cb959c 100644
--- a/extension/llm/custom_ops/targets.bzl
+++ b/extension/llm/custom_ops/targets.bzl
@@ -1,4 +1,9 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load(
+    "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
+    "get_compiler_optimization_flags",
+)
+
 
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
@@ -34,7 +39,7 @@ def define_common_targets():
                 "//executorch/kernels/portable/cpu/util:reduce_util",
                 "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform",
             ],
-            compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"],
+            compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"] + get_compiler_optimization_flags(),
             visibility = [
                 "//executorch/...",
                 "//executorch/extension/llm/custom_ops/...",

From 54f6c6a649e318b6059b12a877e80ff297cff288 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 30 Oct 2024 15:31:04 -0700
Subject: [PATCH 3/3] [Executorch][Portable] Compile portable ops with -O2

Pull Request resolved: https://github.com/pytorch/executorch/pull/6519

prefill toks: 21
decode: 11
after: e2e 621ms, prefill: 370ms, decode: 43.84 tok/sec
ghstack-source-id: 251021625
@exported-using-ghexport

Differential Revision: [D64910578](https://our.internmc.facebook.com/intern/diff/D64910578/)
---
 shim/xplat/executorch/kernels/portable/op_registration_util.bzl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl
index c5349b9475b..6a25f35c304 100644
--- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl
+++ b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl
@@ -150,7 +150,7 @@ def define_op_library(name, deps, android_deps, aten_target, _allow_third_party_
             # library, and it blocks users like unit tests to use kernel
             # implementation directly. So we enable this for xplat only.
             ["-fvisibility=hidden"] if is_xplat() else []
-        ),
+        ) + get_compiler_optimization_flags(),
         deps = [
             "//executorch/runtime/kernel:kernel_includes" + aten_suffix,
         ] + deps,