From 247557a296c9dd64f13d828d9af76b75c70d1216 Mon Sep 17 00:00:00 2001 From: lucylq Date: Mon, 19 May 2025 09:37:54 -0700 Subject: [PATCH] Dtype selective build for optimized ops Pull Request resolved: https://github.com/pytorch/executorch/pull/10878 Add dtype selective build for optimized ops. Follows the same process as portable, where we copy the source files and rebuild the library. 1. Generalize copy genrule for portable/optimized/source/header. 2. Copy optimized source files + headers. 3. Build optimized ops using source files, dependencies, portable header. 4. Add test, confirm that we can run addmul with float dtypes (when we remove, the test fails). ghstack-source-id: 284862896 @exported-using-ghexport Differential Revision: [D74688554](https://our.internmc.facebook.com/intern/diff/D74688554/) --- examples/selective_build/targets.bzl | 15 ++ kernels/optimized/cpu/targets.bzl | 21 +- .../core/portable_type/c10/c10/targets.bzl | 2 +- shim_et/xplat/executorch/codegen/codegen.bzl | 196 ++++++++++++++---- .../optimized/op_registration_util.bzl | 8 + 5 files changed, 194 insertions(+), 48 deletions(-) diff --git a/examples/selective_build/targets.bzl b/examples/selective_build/targets.bzl index 276ee3afe41..685cf5068e4 100644 --- a/examples/selective_build/targets.bzl +++ b/examples/selective_build/targets.bzl @@ -69,6 +69,19 @@ def define_common_targets(): visibility = ["//executorch/..."], ) + executorch_generated_lib( + name = "select_ops_in_dict_lib_optimized", + functions_yaml_target = "//executorch/kernels/optimized:optimized.yaml", + kernel_deps = [ + "//executorch/kernels/optimized:optimized_operators", + ], + deps = [ + ":select_ops_in_dict", + ], + dtype_selective_build = True, + visibility = ["//executorch/..."], + ) + # Select all ops from a yaml file et_operator_library( name = "select_ops_from_yaml", @@ -121,6 +134,8 @@ def define_common_targets(): lib.append(":select_ops_in_list_lib") elif select_ops == "dict": lib.append(":select_ops_in_dict_lib") + elif select_ops == "dict_optimized": + lib.append(":select_ops_in_dict_lib_optimized") elif select_ops == "yaml": lib.append(":select_ops_from_yaml_lib") elif select_ops == "model": diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index 0d60d2c6bee..62c47c6256f 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -25,7 +25,7 @@ def define_common_targets(): name = "add_sub_impl", srcs = [], exported_headers = ["op_add_sub_impl.h"], - visibility = ["//executorch/kernels/optimized/cpu/..."], + visibility = ["//executorch/kernels/optimized/cpu/...", "@EXECUTORCH_CLIENTS",], exported_deps = [ "//executorch/runtime/core:core", "//executorch/kernels/portable/cpu/util:broadcast_indexes_range", @@ -36,14 +36,14 @@ def define_common_targets(): name = "fft_utils", srcs = [], exported_headers = ["fft_utils.h"], - visibility = ["//executorch/kernels/optimized/cpu/..."], + visibility = ["//executorch/kernels/optimized/cpu/...", "@EXECUTORCH_CLIENTS",], exported_deps = [] if runtime.is_oss else ["fbsource//third-party/pocket_fft:pocketfft"], ) runtime.cxx_library( name = "binary_ops", exported_headers = ["binary_ops.h"], - visibility = ["//executorch/kernels/optimized/cpu/..."], + visibility = ["//executorch/kernels/optimized/cpu/...", "@EXECUTORCH_CLIENTS",], exported_deps = ["//executorch/runtime/core:core"], ) @@ -58,9 +58,22 @@ def define_common_targets(): name = "moments_utils", srcs = [], exported_headers = ["moments_utils.h"], - visibility = ["//executorch/kernels/optimized/..."], + visibility = ["//executorch/kernels/optimized/...", "@EXECUTORCH_CLIENTS",], exported_deps = [ "//executorch/kernels/optimized:libvec", "//executorch/kernels/optimized:libutils", ], ) + + # Used for dtype selective build. Collect source and header files. + runtime.filegroup( + name = "optimized_source_files", + srcs = native.glob(["*.cpp"]), + visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"], + ) + + runtime.filegroup( + name = "optimized_header_files", + srcs = native.glob(["*.h"]), + visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"], + ) diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index 4555d42a567..176e4b8980b 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -53,7 +53,7 @@ def define_common_targets(): runtime.cxx_library( name = "aten_headers_for_executorch", srcs = [], - visibility = ["//executorch/kernels/optimized/..."], + visibility = ["//executorch/kernels/optimized/...", "@EXECUTORCH_CLIENTS"], exported_deps = select({ "DEFAULT": [], "ovr_config//cpu:arm64": [ diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl index e7bb7ecf9e0..f4996b6aaf7 100644 --- a/shim_et/xplat/executorch/codegen/codegen.bzl +++ b/shim_et/xplat/executorch/codegen/codegen.bzl @@ -1,6 +1,12 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_default_executorch_platforms", "is_xplat", "runtime", "struct_to_json") load("@fbsource//xplat/executorch/build:selects.bzl", "selects") load("@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", "portable_header_list", "portable_source_list") +load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "optimized_header_list", "optimized_source_list") +load( + "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl", + "get_vec_deps", + "get_vec_preprocessor_flags", +) # Headers that declare the function signatures of the C++ functions that # map to entries in functions.yaml and custom_ops.yaml. @@ -384,52 +390,60 @@ def exir_custom_ops_aot_lib( force_static = False, ) -# Used for dtype selective build. Genrules to copy source and header files. -def portable_outs(target_name, file_list): - outs = {} - for file in file_list: - outs[file] = ["{}/{}".format(target_name, file)] - return outs - -def copy_portable_source_files(name): - target_name = "portable_source_files" +def copy_files(genrule_name, target, file_list): + """ + Copy files from `target` to current directory. + genrule_name: name of this copy genrule. + target: a runtime.filegroup that globs together files. + eg. //executorch/kernels/portable/cpu:portable_source_files. + file_list: list of filenames, used to generate the outfiles. + eg. //executorch/kernels/portable/cpu:portable_source_list. + """ + target_name = target.split(":")[1] runtime.genrule( - name = name, - cmd = "cp -f -r $(location //executorch/kernels/portable/cpu:{}) $OUT/".format(target_name), - outs = portable_outs(target_name, portable_source_list()), + name = genrule_name, + cmd = "cp -f -r $(location {}) $OUT/".format(target), + outs = {file: ["{}/{}".format(target_name, file)] for file in file_list}, default_outs = ["."], ) -def copy_portable_header_files(name): - target_name = "portable_header_files" - runtime.genrule( +def build_portable_header_lib(name, oplist_header_name, feature = None): + """Build the portable headers into a header-only library. + Ensures that includes work across portable and optimized libs. + #include "executorch/kernels/portable/cpu/" + """ + # Copy portable header files. + portable_header_files = {} + genrule_name = name + "_copy_portable_header" + copy_files(genrule_name, "//executorch/kernels/portable/cpu:portable_header_files", portable_header_list()) + for header in portable_header_list(): + portable_header_files[header] = ":{}[{}]".format(genrule_name, header) + + # Include dtype header. + portable_header_files["selected_op_variants.h"] = ":{}[selected_op_variants]".format(oplist_header_name) + + # Build portable headers lib. + runtime.cxx_library( name = name, - cmd = "cp -f -r $(location //executorch/kernels/portable/cpu:{}) $OUT/".format(target_name), - outs = portable_outs(target_name, portable_header_list()), - default_outs = ["."], + srcs = [], + exported_headers = portable_header_files, + exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"], + # header_namespace is only available in xplat. See https://fburl.com/code/we2gvopk + header_namespace = "executorch/kernels/portable/cpu", + feature = feature, ) -def build_portable_lib(name, oplist_header_name, feature = None, expose_operator_symbols = False): +def build_portable_lib(name, oplist_header_name, portable_header_lib, feature = None, expose_operator_symbols = False): """Build portable lib from source. We build from source so that the generated header file, selected_op_variants.h, can be used to selectively build the lib for different dtypes. """ # Copy portable cpp files. portable_source_files = [] - copy_portable_source_files_genrule = name + "_copy_portable_source" - copy_portable_source_files(copy_portable_source_files_genrule) + genrule_name = name + "_copy_portable_source" + copy_files(genrule_name, "//executorch/kernels/portable/cpu:portable_source_files", portable_source_list()) for op in portable_source_list(): - portable_source_files.append(":{}[{}]".format(copy_portable_source_files_genrule, op)) - - # Copy portable header files. - portable_header_files = {} - copy_portable_header_files_genrule = name + "_copy_portable_header" - copy_portable_header_files(copy_portable_header_files_genrule) - for header in portable_header_list(): - portable_header_files[header] = ":{}[{}]".format(copy_portable_header_files_genrule, header) - - # Include dtype header. - portable_header_files["selected_op_variants.h"] = ":{}[selected_op_variants]".format(oplist_header_name) + portable_source_files.append(":{}[{}]".format(genrule_name, op)) # For shared library build, we don't want to expose symbols of # kernel implementation (ex torch::executor::native::tanh_out) @@ -449,9 +463,8 @@ def build_portable_lib(name, oplist_header_name, feature = None, expose_operator runtime.cxx_library( name = name, srcs = portable_source_files, - exported_headers = portable_header_files, exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"], - deps = ["//executorch/kernels/portable/cpu/pattern:all_deps", "//executorch/kernels/portable/cpu/util:all_deps"], + deps = ["//executorch/kernels/portable/cpu/pattern:all_deps", "//executorch/kernels/portable/cpu/util:all_deps"] + [":" + portable_header_lib], # header_namespace is only available in xplat. See https://fburl.com/code/we2gvopk header_namespace = "executorch/kernels/portable/cpu", compiler_flags = compiler_flags, @@ -467,6 +480,88 @@ def build_portable_lib(name, oplist_header_name, feature = None, expose_operator feature = feature, ) +def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature = None, expose_operator_symbols = False): + """Build optimized lib from source. We build from source so that the generated header file, + selected_op_variants.h, can be used to selectively build the lib for different dtypes. + """ + + # Copy optimized cpp files. + optimized_source_files = [] + source_genrule = name + "_copy_optimized_source" + copy_files(source_genrule, "//executorch/kernels/optimized/cpu:optimized_source_files", optimized_source_list()) + for op in optimized_source_list(): + optimized_source_files.append(":{}[{}]".format(source_genrule, op)) + + # Copy optimized header files. + optimized_header_files = {} + header_genrule = name + "_copy_optimized_header" + copy_files(header_genrule, "//executorch/kernels/optimized/cpu:optimized_header_files", optimized_header_list()) + for header in optimized_header_list(): + optimized_header_files[header] = ":{}[{}]".format(header_genrule, header) + + # For shared library build, we don't want to expose symbols of + # kernel implementation (ex torch::executor::native::tanh_out) + # to library users. They should use kernels through registry only. + # With visibility=hidden, linker won't expose kernel impl symbols + # so it can prune unregistered kernels. + # Currently fbcode links all dependent libraries through shared + # library, and it blocks users like unit tests to use kernel + # implementation directly. So we enable this for xplat only. + compiler_flags = ["-Wno-missing-prototypes", "-Wno-pass-failed","-Wno-global-constructors","-Wno-shadow",] + if not expose_operator_symbols: + # Removing '-fvisibility=hidden' exposes operator symbols. + # This allows operators to be called outside of the kernel registry. + compiler_flags += ["-fvisibility=hidden"] + + # Set up dependencies. + optimized_lib_deps = [ + "//executorch/kernels/optimized/cpu:add_sub_impl", + "//executorch/kernels/optimized/cpu:binary_ops", + "//executorch/kernels/optimized/cpu:fft_utils", + "//executorch/kernels/optimized/cpu:moments_utils", + "//executorch/kernels/optimized:libblas", + "//executorch/kernels/optimized:libutils", + "//executorch/kernels/optimized:libvec", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch", + "//executorch/runtime/kernel:kernel_includes", + ":" + portable_header_lib, + ] + get_vec_deps() + + # Build optimized lib. + runtime.cxx_library( + name = name, + srcs = optimized_source_files, + exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"], + deps = optimized_lib_deps, + # header_namespace is only available in xplat. See https://fburl.com/code/we2gvopk + header_namespace = "executorch/kernels/optimized/cpu", + compiler_flags = compiler_flags, + preprocessor_flags = get_vec_preprocessor_flags(), + # sleef needs to be added as a direct dependency of the operator target when building for Android, + # or a linker error may occur. Not sure why this happens; it seems that fbandroid_platform_deps of + # dependencies are not transitive + fbandroid_platform_deps = [ + ( + "^android-arm64.*$", + [ + "fbsource//third-party/sleef:sleef_arm", + ], + ), + ], + # WARNING: using a deprecated API to avoid being built into a shared + # library. In the case of dynamically loading so library we don't want + # it to depend on other so libraries because that way we have to + # specify library directory path. + force_static = True, + # link_whole is necessary because the operators register themselves + # via static initializers that run at program startup. + # @lint-ignore BUCKLINT link_whole + link_whole = True, + feature = feature, + ) + def executorch_generated_lib( name, functions_yaml_target = None, @@ -629,14 +724,29 @@ def executorch_generated_lib( ) portable_lib = [] - if dtype_selective_build and is_xplat() and "//executorch/kernels/portable:operators" in kernel_deps: - # Remove portable from kernel_deps as we're building it from source. - kernel_deps.remove("//executorch/kernels/portable:operators") - - # Build portable lib. - portable_lib_name = name + "_portable_lib" - build_portable_lib(portable_lib_name, oplist_header_name, feature, expose_operator_symbols) - portable_lib = [":{}".format(portable_lib_name)] + optimized_lib = [] + if dtype_selective_build and is_xplat(): + # Build portable headers lib. Used for portable and optimized kernel libraries. + portable_header_lib = name + "_portable_header_lib" + build_portable_header_lib(portable_header_lib, oplist_header_name, feature) + + if "//executorch/kernels/portable:operators" in kernel_deps: + # Remove portable from kernel_deps as we're building it from source. + kernel_deps.remove("//executorch/kernels/portable:operators") + + # Build portable lib. + portable_lib_name = name + "_portable_lib" + build_portable_lib(portable_lib_name, oplist_header_name, portable_header_lib, feature, expose_operator_symbols) + portable_lib = [":{}".format(portable_lib_name)] + + if "//executorch/kernels/optimized:optimized_operators" in kernel_deps: + # Remove optimized from kernel_deps as we're building it from source. + kernel_deps.remove("//executorch/kernels/optimized:optimized_operators") + + # Build optimized lib. + optimized_lib_name = name + "_optimized_lib" + build_optimized_lib(optimized_lib_name, oplist_header_name, portable_header_lib, feature, expose_operator_symbols) + optimized_lib = [":{}".format(optimized_lib_name)] # Exports headers that declare the function signatures of the C++ functions # that map to entries in `functions.yaml` and `custom_ops.yaml`. @@ -690,7 +800,7 @@ def executorch_generated_lib( "//executorch/kernels/prim_ops:prim_ops_registry" + aten_suffix, "//executorch/runtime/core:evalue" + aten_suffix, "//executorch/codegen:macros", - ] + deps + kernel_deps + portable_lib, + ] + deps + kernel_deps + portable_lib + optimized_lib, exported_deps = [ "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix, diff --git a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl index 5121dbad599..dc46e4dd77e 100644 --- a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl @@ -265,3 +265,11 @@ OPTIMIZED_ATEN_OPS = ( ], ), ) + +def optimized_source_list(): + """All the source file names from //executorch/kernels/optimized/cpu""" + return [op["name"] + ".cpp" for op in OPTIMIZED_ATEN_OPS] + +def optimized_header_list(): + """All the header file names from //executorch/kernels/optimized/cpu""" + return ["binary_ops.h", "fft_utils.h", "moments_utils.h", "op_add_sub_impl.h",]