From e56fafb7e8a3ddbeace7e442bab1b7172921c91d Mon Sep 17 00:00:00 2001
From: lucylq <lfq@meta.com>
Date: Thu, 22 May 2025 10:51:57 -0700
Subject: [PATCH] Dtype selective build: mostly enable in fbcode

Pull Request resolved: https://github.com/pytorch/executorch/pull/11016

This (mostly) enables dtype selective build in fbcode by removing `header_namespace` and including the generated header as `#include "selected_op_variants.h"` instead of through the full path `#include <executorch/kernels/portable/cpu/selected_op_variants.h>`.

Note:
`expose_operator_symbols=False` only works in xplat, so emit a warning for that case too (to be turned into a failure once internal cases move to xplat). I don't think we should recommend that users set it to True, as that prevents a library from linking multiple executorch_generated_libs (the symbols will clash).
ghstack-source-id: 285663278

Differential Revision: [D75082395](https://our.internmc.facebook.com/intern/diff/D75082395/)
---
 kernels/portable/cpu/pattern/targets.bzl      |   4 +-
 kernels/portable/cpu/selective_build.h        |   2 +-
 kernels/portable/cpu/targets.bzl              |   8 +-
 kernels/portable/cpu/util/targets.bzl         |   2 +-
 shim_et/xplat/executorch/codegen/codegen.bzl  | 134 ++++++++++--------
 .../optimized/op_registration_util.bzl        |   4 -
 .../kernels/portable/op_registration_util.bzl |   4 -
 7 files changed, 82 insertions(+), 76 deletions(-)

diff --git a/kernels/portable/cpu/pattern/targets.bzl b/kernels/portable/cpu/pattern/targets.bzl
index 51a6374a5d3..5fc73ccd911 100644
--- a/kernels/portable/cpu/pattern/targets.bzl
+++ b/kernels/portable/cpu/pattern/targets.bzl
@@ -11,7 +11,7 @@ def define_common_targets():
     # build, where the portable ops are built from source and linked with :all_deps
     runtime.cxx_library(
         name = "all_deps",
-        deps = [
+        exported_deps = [
             "//executorch/kernels/portable/cpu/pattern:pattern",
             "//executorch/kernels/portable/cpu/pattern:bitwise_op",
             "//executorch/kernels/portable/cpu/pattern:comparison_op",
@@ -58,7 +58,7 @@ def define_common_targets():
             "pattern.h",
         ],
         compiler_flags = ["-Wno-missing-prototypes"],
-        deps = [
+        exported_deps = [
             "//executorch/kernels/portable/cpu/util:broadcast_util",
             "//executorch/kernels/portable/cpu/util:functional_util",
             "//executorch/runtime/kernel:kernel_includes",
diff --git a/kernels/portable/cpu/selective_build.h b/kernels/portable/cpu/selective_build.h
index be8cee0c859..6b46e009553 100644
--- a/kernels/portable/cpu/selective_build.h
+++ b/kernels/portable/cpu/selective_build.h
@@ -13,7 +13,7 @@
 #ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE
 // include header generated by
 // executorch/codegen/tools/gen_selected_op_variants.py
-#include <executorch/kernels/portable/cpu/selected_op_variants.h>
+#include "selected_op_variants.h"
 #else
 // dummy implementation
 inline constexpr bool should_include_kernel_dtype(
diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl
index b428a5d107e..69db422b184 100644
--- a/kernels/portable/cpu/targets.bzl
+++ b/kernels/portable/cpu/targets.bzl
@@ -46,7 +46,11 @@ def define_common_targets():
         ],
         srcs = [],
         exported_headers = ["vec_ops.h"],
-        visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/quantized/..."],
+        visibility = [
+            "//executorch/kernels/portable/...",
+            "//executorch/kernels/quantized/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
     )
 
     # Only for use by targets in this directory. Defines constants like M_PI
@@ -58,7 +62,7 @@ def define_common_targets():
             "math_constants.h",
         ],
         visibility = [
-            "//executorch/kernels/portable/cpu/...",
+            "//executorch/kernels/portable/...", "@EXECUTORCH_CLIENTS",
         ],
     )
 
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index e756a9bf282..560e0472881 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -11,7 +11,7 @@ def define_common_targets():
     # build, where the portable ops are built from source and linked with :all_deps
     runtime.cxx_library(
         name = "all_deps",
-        deps = [
+        exported_deps = [
             "//executorch/extension/threadpool:threadpool",
             "//executorch/kernels/portable/cpu/util:functional_util",
             "//executorch/kernels/portable/cpu/util:broadcast_util",
diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl
index 4c14db670d9..df4e2f41c8f 100644
--- a/shim_et/xplat/executorch/codegen/codegen.bzl
+++ b/shim_et/xplat/executorch/codegen/codegen.bzl
@@ -1,7 +1,7 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_default_executorch_platforms", "is_xplat", "runtime", "struct_to_json")
 load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
-load("@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", "portable_header_list", "portable_source_list")
-load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "optimized_header_list", "optimized_source_list")
+load("@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", "portable_source_list")
+load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "optimized_source_list")
 load(
     "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
     "get_vec_deps",
@@ -407,29 +407,40 @@ def copy_files(genrule_name, target, file_list):
         default_outs = ["."],
     )
 
+def get_portable_lib_deps():
+    return [
+        "//executorch/kernels/portable/cpu:math_constants",
+        "//executorch/kernels/portable/cpu:scalar_utils",
+        "//executorch/kernels/portable/cpu:vec_ops",
+        "//executorch/kernels/portable/cpu/pattern:all_deps",
+        "//executorch/kernels/portable/cpu/util:all_deps",
+    ]
+
+def get_optimized_lib_deps():
+    return [
+        "//executorch/kernels/optimized/cpu:add_sub_impl",
+        "//executorch/kernels/optimized/cpu:binary_ops",
+        "//executorch/kernels/optimized/cpu:fft_utils",
+        "//executorch/kernels/optimized/cpu:moments_utils",
+        "//executorch/kernels/optimized:libblas",
+        "//executorch/kernels/optimized:libutils",
+        "//executorch/kernels/optimized:libvec",
+        "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
+        "//executorch/runtime/kernel:kernel_includes",
+    ] + get_vec_deps()
+
 def build_portable_header_lib(name, oplist_header_name, feature = None):
     """Build the portable headers into a header-only library.
     Ensures that includes work across portable and optimized libs.
-    #include "executorch/kernels/portable/cpu/<header.h>"
     """
-    # Copy portable header files.
-    portable_header_files = {}
-    genrule_name = name + "_copy_portable_header"
-    copy_files(genrule_name, "//executorch/kernels/portable/cpu:portable_header_files", portable_header_list())
-    for header in portable_header_list():
-        portable_header_files[header] = ":{}[{}]".format(genrule_name, header)
-
-    # Include dtype header.
-    portable_header_files["selected_op_variants.h"] = ":{}[selected_op_variants]".format(oplist_header_name)
-
-    # Build portable headers lib.
     runtime.cxx_library(
         name = name,
         srcs = [],
-        exported_headers = portable_header_files,
+        exported_headers = {
+            "selected_op_variants.h":":{}[selected_op_variants]".format(oplist_header_name),
+        },
         exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"],
-        # header_namespace is only available in xplat. See https://fburl.com/code/we2gvopk
-        header_namespace = "executorch/kernels/portable/cpu",
+        header_namespace = "",
         feature = feature,
     )
 
@@ -454,7 +465,7 @@ def build_portable_lib(name, oplist_header_name, portable_header_lib, feature =
     # library, and it blocks users like unit tests to use kernel
     # implementation directly. So we enable this for xplat only.
     compiler_flags = ["-Wno-missing-prototypes"]
-    if not expose_operator_symbols:
+    if not expose_operator_symbols and is_xplat():
         # Removing '-fvisibility=hidden' exposes operator symbols.
         # This allows operators to be called outside of the kernel registry.
         compiler_flags += ["-fvisibility=hidden"]
@@ -464,9 +475,7 @@ def build_portable_lib(name, oplist_header_name, portable_header_lib, feature =
         name = name,
         srcs = portable_source_files,
         exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"],
-        deps = ["//executorch/kernels/portable/cpu/pattern:all_deps", "//executorch/kernels/portable/cpu/util:all_deps"] +  [":" + portable_header_lib],
-        # header_namespace is only available in xplat. See https://fburl.com/code/we2gvopk
-        header_namespace = "executorch/kernels/portable/cpu",
+        deps = get_portable_lib_deps() + [":" + portable_header_lib],
         compiler_flags = compiler_flags,
         # WARNING: using a deprecated API to avoid being built into a shared
         # library. In the case of dynamically loading so library we don't want
@@ -492,13 +501,6 @@ def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature =
     for op in optimized_source_list():
         optimized_source_files.append(":{}[{}]".format(source_genrule, op))
 
-    # Copy optimized header files.
-    optimized_header_files = {}
-    header_genrule = name + "_copy_optimized_header"
-    copy_files(header_genrule, "//executorch/kernels/optimized/cpu:optimized_header_files", optimized_header_list())
-    for header in optimized_header_list():
-        optimized_header_files[header] = ":{}[{}]".format(header_genrule, header)
-
     # For shared library build, we don't want to expose symbols of
     # kernel implementation (ex torch::executor::native::tanh_out)
     # to library users. They should use kernels through registry only.
@@ -508,35 +510,17 @@ def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature =
     # library, and it blocks users like unit tests to use kernel
     # implementation directly. So we enable this for xplat only.
     compiler_flags = ["-Wno-missing-prototypes", "-Wno-pass-failed","-Wno-global-constructors","-Wno-shadow",]
-    if not expose_operator_symbols:
+    if not expose_operator_symbols and is_xplat():
         # Removing '-fvisibility=hidden' exposes operator symbols.
         # This allows operators to be called outside of the kernel registry.
         compiler_flags += ["-fvisibility=hidden"]
 
-    # Set up dependencies.
-    optimized_lib_deps = [
-        "//executorch/kernels/optimized/cpu:add_sub_impl",
-        "//executorch/kernels/optimized/cpu:binary_ops",
-        "//executorch/kernels/optimized/cpu:fft_utils",
-        "//executorch/kernels/optimized/cpu:moments_utils",
-        "//executorch/kernels/optimized:libblas",
-        "//executorch/kernels/optimized:libutils",
-        "//executorch/kernels/optimized:libvec",
-        "//executorch/kernels/portable/cpu/pattern:all_deps", 
-        "//executorch/kernels/portable/cpu/util:all_deps",
-        "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
-        "//executorch/runtime/kernel:kernel_includes",
-        ":" + portable_header_lib,
-    ] + get_vec_deps()
-    
     # Build optimized lib.
     runtime.cxx_library(
         name = name,
         srcs = optimized_source_files,
         exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"],
-        deps = optimized_lib_deps,
-        # header_namespace is only available in xplat. See https://fburl.com/code/we2gvopk
-        header_namespace = "executorch/kernels/optimized/cpu",
+        deps = get_portable_lib_deps() + get_optimized_lib_deps() + [":" + portable_header_lib],
         compiler_flags = compiler_flags,
         preprocessor_flags = get_vec_preprocessor_flags(),
         # sleef needs to be added as a direct dependency of the operator target when building for Android,
@@ -627,13 +611,24 @@ def executorch_generated_lib(
         deps: Additinal deps of the main C++ library. Needs to be in either `//executorch` or `//caffe2` module.
         platforms: platforms args to runtime.cxx_library (only used when in xplat)
         manual_registration: if true, generate RegisterKernels.cpp and RegisterKernels.h.
-        use_default_aten_ops_lib: If `aten_mode` is True AND this flag is True, use `torch_mobile_all_ops_et` for ATen operator library.
+        use_default_aten_ops_lib: If `aten_mode` is True AND this flag is True,
+            use `torch_mobile_all_ops_et` for ATen operator library.
         xplat_deps: Additional xplat deps, can be used to provide custom operator library.
         fbcode_deps: Additional fbcode deps, can be used to provide custom operator library.
         compiler_flags: compiler_flags args to runtime.cxx_library
-        dtype_selective_build: In additional to operator selection, dtype selective build further selects the dtypes for each operator. Can be used with model or dict selective build APIs, where dtypes can be specified. Note: this is only available in xplat.
-        feature: Product-Feature Hierarchy (PFH). For internal use only, required for FoA in production. See: https://fburl.com/wiki/2wzjpyqy
-        support_exceptions: enable try/catch wrapper around operator implemntations to make sure exceptions thrown will not bring down the process. Disable if your use case disables exceptions in the build.
+        dtype_selective_build: In addition to operator selection, dtype selective build
+            further selects the dtypes for each operator. Can be used with model or dict
+            selective build APIs, where dtypes can be specified.
+        feature: Product-Feature Hierarchy (PFH). For internal use only, required
+            for FoA in production. See: https://fburl.com/wiki/2wzjpyqy
+        expose_operator_symbols: By default, fvisibility=hidden is set for executorch kernel
+            libraries built with dtype selective build. This option removes the compiler
+            flag and allows operators to be called outside of the kernel registry.
+            NOTE: It is not recommended to set this to True, as symbols may clash (duplicate
+            symbols errors) if multiple executorch_generated_libs are included by a parent library.
+        support_exceptions: enable try/catch wrapper around operator implementations
+            to make sure exceptions thrown will not bring down the process. Disable if your
+            use case disables exceptions in the build.
     """
     if functions_yaml_target and aten_mode:
         fail("{} is providing functions_yaml_target in ATen mode, it will be ignored. `native_functions.yaml` will be the source of truth.".format(name))
@@ -641,7 +636,24 @@ def executorch_generated_lib(
     if not aten_mode and not functions_yaml_target and not custom_ops_yaml_target:
         fail("At least one of functions_yaml_target, custom_ops_yaml_target needs to be provided")
 
+    if expose_operator_symbols:
+        if not dtype_selective_build:
+            fail("""
+            expose_operator_symbols is only available in dtype selective build mode.
+            See: https://www.internalfb.com/wiki/PyTorch/Teams/Edge/PyTorch_Edge_Core_Team/Dtype_Selective_Build/""")
+
     if dtype_selective_build:
+        if not expose_operator_symbols and not is_xplat():
+            # TODO(T225169282): make this a fail once internal cases move to xplat.
+            warning("""
+                Dtype selective build with expose_operator_symbols=False works only in xplat - 
+                there are undefined symbols otherwise. Please try to use xplat, or talk to the
+                executorch team. Setting expose_operator_symbols=True is not recommended as the
+                exposed symbols may clash (duplicate symbols errors) if multiple
+                executorch_generated_libs are included by a parent library.
+                
+                Falling back to operator selective build.""")
+
         if (not "//executorch/kernels/portable:operators" in kernel_deps) and (not "//executorch/kernels/optimized:optimized_operators" in kernel_deps):
             fail("""
             !!WARNING!! Dtype selective build is available for the portable and optimized kernel libraries.
@@ -655,7 +667,7 @@ def executorch_generated_lib(
             If you have a custom kernel library, please remove `dtype_selective_build=True`
             and use regular selective build.
             """.format(kernel_deps))
-        
+
         # Dtype selective build requires that the portable/optimized kernel libraries are not passed into `deps`.
         if ("//executorch/kernels/portable:operators" in kernel_deps):
             index = 0
@@ -755,13 +767,11 @@ def executorch_generated_lib(
             platforms = platforms,
         )
 
-    portable_lib = []
-    optimized_lib = []
-    if dtype_selective_build and is_xplat():
+    if dtype_selective_build:
         # Build portable headers lib. Used for portable and optimized kernel libraries.
         portable_header_lib = name + "_portable_header_lib"
         build_portable_header_lib(portable_header_lib, oplist_header_name, feature)
-        
+
         if "//executorch/kernels/portable:operators" in kernel_deps:
             # Remove portable from kernel_deps as we're building it from source.
             kernel_deps.remove("//executorch/kernels/portable:operators")
@@ -769,16 +779,16 @@ def executorch_generated_lib(
             # Build portable lib.
             portable_lib_name = name + "_portable_lib"
             build_portable_lib(portable_lib_name, oplist_header_name, portable_header_lib, feature, expose_operator_symbols)
-            portable_lib = [":{}".format(portable_lib_name)]
-            
+            kernel_deps.append(":{}".format(portable_lib_name))
+
         if "//executorch/kernels/optimized:optimized_operators" in kernel_deps:
             # Remove optimized from kernel_deps as we're building it from source.
             kernel_deps.remove("//executorch/kernels/optimized:optimized_operators")
-            
+
             # Build optimized lib.
             optimized_lib_name = name + "_optimized_lib"
             build_optimized_lib(optimized_lib_name, oplist_header_name, portable_header_lib, feature, expose_operator_symbols)
-            optimized_lib = [":{}".format(optimized_lib_name)]
+            kernel_deps.append(":{}".format(optimized_lib_name))
 
     # Exports headers that declare the function signatures of the C++ functions
     # that map to entries in `functions.yaml` and `custom_ops.yaml`.
@@ -832,7 +842,7 @@ def executorch_generated_lib(
                 "//executorch/kernels/prim_ops:prim_ops_registry" + aten_suffix,
                 "//executorch/runtime/core:evalue" + aten_suffix,
                 "//executorch/codegen:macros",
-            ] + deps + kernel_deps + portable_lib + optimized_lib,
+            ] + deps + kernel_deps,
             exported_deps = [
                 "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
                 "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix,
diff --git a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
index dc46e4dd77e..bd76ba9f18a 100644
--- a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
@@ -269,7 +269,3 @@ OPTIMIZED_ATEN_OPS = (
 def optimized_source_list():
     """All the source file names from //executorch/kernels/optimized/cpu"""
     return [op["name"] + ".cpp" for op in OPTIMIZED_ATEN_OPS]
-
-def optimized_header_list():
-    """All the header file names from //executorch/kernels/optimized/cpu"""
-    return ["binary_ops.h", "fft_utils.h", "moments_utils.h", "op_add_sub_impl.h",]
diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
index 3bfc7fdf00f..4e379942c52 100644
--- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
@@ -1333,7 +1333,3 @@ CUSTOM_OPS = (
 def portable_source_list():
     """All the source file names from //executorch/kernels/portable/cpu/"""
     return [op["name"] + ".cpp" for op in ATEN_OPS + CUSTOM_OPS]
-
-def portable_header_list():
-    """All the header file names from //executorch/kernels/portable/cpu/"""
-    return ["selective_build.h", "scalar_utils.h", "math_constants.h", "vec_ops.h"]