diff --git a/CMakeLists.txt b/CMakeLists.txt index 7012ec641bf..ca5503a357b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -266,6 +266,18 @@ if(EXECUTORCH_BUILD_PTHREADPOOL) executorch_move_interface_include_directories_to_build_time_only( pthreadpool_interface ) + + if(APPLE) + # Use hidden visibility for pthreadpool on Apple platforms to avoid issues + # with pthreadpool symbols from libtorch_cpu taking precedence over the ones + # from the pthreadpool library statically linked in _portable_lib. The + # pthreadpool public APIs are marked as weak by default on some Apple + # platforms, so setting to hidden visibility works around this by not + # putting the symbol in the indirection table. See + # https://github.com/pytorch/executorch/issues/14321 for more details. + target_compile_options(pthreadpool PRIVATE -fvisibility=hidden) + endif() + install( TARGETS pthreadpool pthreadpool_interface fxdiv EXPORT ExecuTorchTargets diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index a6c06e84293..5bb647d3a09 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -20,6 +20,16 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() +# Threadpool size specifiers. Mutual exclusion is checked in default.cmake. +# Default to using performance cores if +# EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES isn't set. 
+set(_threadpool_size_flag) +if(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) + set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES") +else() + set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES") +endif() + add_library( extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp cpuinfo_utils.cpp @@ -36,7 +46,9 @@ target_include_directories( $ $ ) -target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL) +target_compile_definitions( + extension_threadpool PUBLIC ET_USE_THREADPOOL ${_threadpool_size_flag} +) target_compile_options(extension_threadpool PUBLIC ${_common_compile_options}) # Install libraries diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl index 6ef55c42434..1889cb650ad 100644 --- a/extension/threadpool/targets.bzl +++ b/extension/threadpool/targets.bzl @@ -22,6 +22,7 @@ def define_common_targets(): name = "threadpool_lib", srcs = _THREADPOOL_SRCS, deps = [ + ":cpuinfo_utils", "//executorch/runtime/core:core", "//executorch/runtime/core/portable_type/c10/c10:c10", ], diff --git a/extension/threadpool/test/threadpool_test.cpp b/extension/threadpool/test/threadpool_test.cpp index e7784d3cc11..052e6c22f5e 100644 --- a/extension/threadpool/test/threadpool_test.cpp +++ b/extension/threadpool/test/threadpool_test.cpp @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -71,6 +72,8 @@ void run_lambda_with_size( } // namespace TEST(ThreadPoolTest, ParallelAdd) { + executorch::runtime::runtime_init(); + std::vector a, b, c, c_ref; size_t vector_size = 100; size_t grain_size = 10; @@ -111,6 +114,8 @@ TEST(ThreadPoolTest, ParallelAdd) { // Test parallel reduction where we acquire lock within lambda TEST(ThreadPoolTest, ParallelReduce) { + executorch::runtime::runtime_init(); + std::vector a; int32_t c = 0, c_ref = 0; size_t vector_size = 100; @@ -144,6 +149,8 @@ TEST(ThreadPoolTest, ParallelReduce) { // Copied from // 
caffe2/aten/src/ATen/test/test_thread_pool_guard.cp TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) { + executorch::runtime::runtime_init(); + auto threadpool_ptr = ::executorch::extension::threadpool::get_pthreadpool(); ASSERT_NE(threadpool_ptr, nullptr); @@ -173,6 +180,8 @@ TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) { } TEST(TestNoThreadPoolGuard, TestRunWithGuard) { + executorch::runtime::runtime_init(); + const std::vector array = {1, 2, 3}; auto pool = ::executorch::extension::threadpool::get_threadpool(); diff --git a/extension/threadpool/threadpool.cpp b/extension/threadpool/threadpool.cpp index e9f3b0f5f4a..406489005da 100644 --- a/extension/threadpool/threadpool.cpp +++ b/extension/threadpool/threadpool.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -13,9 +14,26 @@ #include #include +#include #include +// At most one mode should be set. +#if ( \ + defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \ + defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES)) +#error Multiple \ + threadpool size specifiers are set.At most one of \ + EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES, \ + and EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES may be defined. +#endif + +// Default to EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES if no mode is set. 
+#if !defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \ + !defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES) +#define EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES 1 +#endif + namespace executorch::extension::threadpool { #if !(defined(WIN32)) @@ -95,13 +113,21 @@ void ThreadPool::run( // get_threadpool is not thread safe due to leak_corrupted_threadpool // Make this part threadsafe: TODO(kimishpatel) ThreadPool* get_threadpool() { + executorch::runtime::runtime_init(); + if (!cpuinfo_initialize()) { ET_LOG(Error, "cpuinfo initialization failed"); return nullptr; // NOLINT(facebook-hte-NullableReturn) } static const int num_threads = ([]() { - int result = cpuinfo_get_processors_count(); +#if defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) + // Use threads=cores. + auto result = cpuinfo_get_processors_count(); +#else + // Set threads equal to the number of performance cores. + auto result = ::executorch::extension::cpuinfo::get_num_performant_cores(); +#endif /* * For llvm-tsan, holding limit for the number of locks for a single thread @@ -111,9 +137,10 @@ ThreadPool* get_threadpool() { * tricky to detect if we are running under tsan, for now capping the * default threadcount to the tsan limit unconditionally. */ - constexpr int tsan_thread_limit = 63; + constexpr unsigned int tsan_thread_limit = 63; return std::min(result, tsan_thread_limit); })(); + static auto threadpool = std::make_unique(num_threads); // Inheriting from old threadpool to get around segfault issue diff --git a/extension/threadpool/threadpool.h b/extension/threadpool/threadpool.h index 3ad2d1d48d4..16acad6e5fa 100644 --- a/extension/threadpool/threadpool.h +++ b/extension/threadpool/threadpool.h @@ -14,6 +14,22 @@ #include +/* + * Threadpool Options: + * + * Threadpool size has a sizable effect on performance. By default, the + * threadpool will be sized according to the number of performance cores. This + * behavior can be overridden with the following build-time options. 
Note that + * these options are mutually exclusive. + * + * - EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES (flag) - Sizes the threadpool + * equal to the number of performance cores on the system. This is the default + * behavior. + * - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES (flag) - Sizes the threadpool + * equal to the number of logical cores on the system. This is the historical + * behavior. + */ + namespace executorch::extension::threadpool { class ThreadPool final { diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index fb0dc0a4ade..16f4245f6bc 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -176,6 +176,36 @@ define_overridable_option( ${_default_executorch_build_cpuinfo} ) +# Threadpool size options. At most one can be specified. Note that the default +# is managed in threadpool.cpp to allow the user to specify an alternate mode +# without needing to explicitly set the default to off. +define_overridable_option( + EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES + "Set the number of threads used for CPU parallel computation equal to the number of performant CPU cores." + BOOL + OFF +) +define_overridable_option( + EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES + "Set the number of threads used for CPU parallel computation equal to the number of logical CPU cores." + BOOL + OFF +) + +check_required_options_on( + IF_ON EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES REQUIRES + EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO +) +check_required_options_on( + IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES REQUIRES + EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO +) + +check_conflicting_options_on( + IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES CONFLICTS_WITH + EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES +) + # TODO(jathu): move this to platform specific presets when created set(_default_executorch_build_executor_runner ON) if(APPLE AND "${SDK_NAME}" STREQUAL "iphoneos")