Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,18 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
executorch_move_interface_include_directories_to_build_time_only(
pthreadpool_interface
)

# Apple-only workaround: compile pthreadpool's own sources with hidden symbol
# visibility.
if(APPLE)
# Use hidden visibility for pthreadpool on Apple platforms to avoid issues
# with pthreadpool symbols from libtorch_cpu taking precedence over the ones
# from the pthreadpool library statically linked in _portable_lib. The
# pthreadpool public APIs are marked as weak by default on some Apple
# platforms, so setting to hidden visibility works around this by not
# putting the symbol in the indirection table. See
# https://github.com/pytorch/executorch/issues/14321 for more details.
#
# PRIVATE keeps the flag on pthreadpool's own compilation only; it is not
# propagated to targets that link against pthreadpool.
# NOTE(review): the C_VISIBILITY_PRESET target property is the usual CMake way
# to express this; confirm whether it is usable for this third-party target.
target_compile_options(pthreadpool PRIVATE -fvisibility=hidden)
endif()

install(
TARGETS pthreadpool pthreadpool_interface fxdiv
EXPORT ExecuTorchTargets
Expand Down
14 changes: 13 additions & 1 deletion extension/threadpool/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,16 @@ if(NOT CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
endif()

# Pick the threadpool size specifier that is passed to the compiler as a
# preprocessor definition. Mutual exclusion of the two modes is checked in
# default.cmake. Performance cores are the default; the all-logical-cores mode
# is used only when EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES is enabled.
set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES")
if(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES)
  set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES")
endif()

add_library(
extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp
cpuinfo_utils.cpp
Expand All @@ -36,7 +46,9 @@ target_include_directories(
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include>
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include>
)
# Compile definitions and options for extension_threadpool. Everything here is
# PUBLIC, so targets that link extension_threadpool are compiled with the same
# ET_USE_THREADPOOL define, threadpool size specifier, and common options.
# ET_USE_THREADPOOL is registered once, together with the size specifier,
# rather than through two separate calls.
target_compile_definitions(
  extension_threadpool PUBLIC ET_USE_THREADPOOL ${_threadpool_size_flag}
)
target_compile_options(extension_threadpool PUBLIC ${_common_compile_options})

# Install libraries
Expand Down
1 change: 1 addition & 0 deletions extension/threadpool/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def define_common_targets():
name = "threadpool_lib",
srcs = _THREADPOOL_SRCS,
deps = [
":cpuinfo_utils",
"//executorch/runtime/core:core",
"//executorch/runtime/core/portable_type/c10/c10:c10",
],
Expand Down
9 changes: 9 additions & 0 deletions extension/threadpool/test/threadpool_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/

#include <executorch/extension/threadpool/threadpool.h>
#include <executorch/runtime/platform/runtime.h>

#include <mutex>
#include <numeric>
Expand Down Expand Up @@ -71,6 +72,8 @@ void run_lambda_with_size(
} // namespace

TEST(ThreadPoolTest, ParallelAdd) {
executorch::runtime::runtime_init();

std::vector<int32_t> a, b, c, c_ref;
size_t vector_size = 100;
size_t grain_size = 10;
Expand Down Expand Up @@ -111,6 +114,8 @@ TEST(ThreadPoolTest, ParallelAdd) {

// Test parallel reduction where we acquire lock within lambda
TEST(ThreadPoolTest, ParallelReduce) {
executorch::runtime::runtime_init();

std::vector<int32_t> a;
int32_t c = 0, c_ref = 0;
size_t vector_size = 100;
Expand Down Expand Up @@ -144,6 +149,8 @@ TEST(ThreadPoolTest, ParallelReduce) {
// Copied from
// caffe2/aten/src/ATen/test/test_thread_pool_guard.cpp
TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) {
executorch::runtime::runtime_init();

auto threadpool_ptr = ::executorch::extension::threadpool::get_pthreadpool();

ASSERT_NE(threadpool_ptr, nullptr);
Expand Down Expand Up @@ -173,6 +180,8 @@ TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) {
}

TEST(TestNoThreadPoolGuard, TestRunWithGuard) {
executorch::runtime::runtime_init();

const std::vector<int64_t> array = {1, 2, 3};

auto pool = ::executorch::extension::threadpool::get_threadpool();
Expand Down
31 changes: 29 additions & 2 deletions extension/threadpool/threadpool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,34 @@
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/extension/threadpool/cpuinfo_utils.h>
#include <executorch/extension/threadpool/threadpool.h>

#include <algorithm>
#include <memory>

#include <executorch/extension/threadpool/threadpool_guard.h>
#include <executorch/runtime/platform/assert.h>
#include <executorch/runtime/platform/runtime.h>

#include <cpuinfo.h>

// Build-time sanity check: the two threadpool sizing modes are mutually
// exclusive, so fail compilation with a readable diagnostic if both macros
// are defined.
#if defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \
    defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES)
#error Multiple threadpool size specifiers are set. At most one of \
EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES and \
EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES may be defined.
#endif

// Default to EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES if no mode is set.
// This fallback only takes effect when the build defines neither sizing macro.
// NOTE(review): the option documentation in threadpool.h and the CMake build
// for extension_threadpool both describe performance cores as the default,
// whereas this fallback selects all logical cores - confirm that the
// difference between build systems is intended.
#if !defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \
!defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES)
#define EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES 1
#endif

namespace executorch::extension::threadpool {

#if !(defined(WIN32))
Expand Down Expand Up @@ -95,13 +113,21 @@ void ThreadPool::run(
// get_threadpool is not thread safe due to leak_corrupted_threadpool
// Make this part threadsafe: TODO(kimishpatel)
ThreadPool* get_threadpool() {
executorch::runtime::runtime_init();

if (!cpuinfo_initialize()) {
ET_LOG(Error, "cpuinfo initialization failed");
return nullptr; // NOLINT(facebook-hte-NullableReturn)
}

static const int num_threads = ([]() {
int result = cpuinfo_get_processors_count();
#if defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES)
// Use threads=cores.
auto result = cpuinfo_get_processors_count();
#else
// Set threads equal to the number of performance cores.
auto result = ::executorch::extension::cpuinfo::get_num_performant_cores();
#endif

/*
* For llvm-tsan, holding limit for the number of locks for a single thread
Expand All @@ -111,9 +137,10 @@ ThreadPool* get_threadpool() {
* tricky to detect if we are running under tsan, for now capping the
* default threadcount to the tsan limit unconditionally.
*/
constexpr int tsan_thread_limit = 63;
constexpr unsigned int tsan_thread_limit = 63;
return std::min(result, tsan_thread_limit);
})();

static auto threadpool = std::make_unique<ThreadPool>(num_threads);

// Inheriting from old threadpool to get around segfault issue
Expand Down
16 changes: 16 additions & 0 deletions extension/threadpool/threadpool.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,22 @@

#include <pthreadpool.h>

/*
* Threadpool Options:
*
* Threadpool size has a sizable effect on performance. By default, the
* threadpool will be sized according to the number of performance cores. This
* behavior can be overridden with the following build-time options. Note that
* these options are mutually exclusive.
*
* - EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES (flag) - Sizes the threadpool
* equal to the number of performance cores on the system. This is the default
* behavior.
* - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES (flag) - Sizes the threadpool
* equal to the number of logical cores on the system. This is the historical
* behavior.
*/

namespace executorch::extension::threadpool {

class ThreadPool final {
Expand Down
30 changes: 30 additions & 0 deletions tools/cmake/preset/default.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,36 @@ define_overridable_option(
${_default_executorch_build_cpuinfo}
)

# Threadpool size options. At most one can be specified. Note that the default
# is managed in threadpool.cpp to allow the user to specify an alternate mode
# without needing to explicitly set the default to off.
# NOTE(review): extension/threadpool/CMakeLists.txt also selects a size
# specifier when neither option is ON - confirm which default applies to CMake
# builds.

# Boolean option, OFF by default: size the threadpool by performant core count.
define_overridable_option(
EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES
"Set the number of threads used for CPU parallel computation equal to the number of performant CPU cores."
BOOL
OFF
)
# Boolean option, OFF by default: size the threadpool by logical core count.
define_overridable_option(
EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES
"Set the number of threads used for CPU parallel computation equal to the number of logical CPU cores."
BOOL
OFF
)

# Each threadpool sizing mode requires both pthreadpool and cpuinfo to be
# built, so the same requirement check is applied to every mode in turn.
foreach(
  _threadpool_size_option
  EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES
  EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES
)
  check_required_options_on(
    IF_ON ${_threadpool_size_option} REQUIRES EXECUTORCH_BUILD_PTHREADPOOL
    EXECUTORCH_BUILD_CPUINFO
  )
endforeach()

# The two sizing modes are mutually exclusive.
check_conflicting_options_on(
  IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES CONFLICTS_WITH
  EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES
)

# TODO(jathu): move this to platform specific presets when created
set(_default_executorch_build_executor_runner ON)
if(APPLE AND "${SDK_NAME}" STREQUAL "iphoneos")
Expand Down
Loading