Skip to content

Commit 9d64275

Browse files
committed
[OpenMP] Added the support for hidden helper task in RTL
The basic design is to create an outermost parallel team. It is not a regular team: it is created only when the first hidden helper task is encountered, and it is responsible only for executing hidden helper tasks. We first use `pthread_create` to create a new thread, which we call the initial thread; it is also the main thread of the hidden helper team. This initial thread then initializes a new root, just as the RTL does during initialization. After that, it directly calls `__kmpc_fork_call`, as if the initial thread had encountered a parallel region. In the wrapped function for this team, the main thread (the initial thread that we create via `pthread_create` on Linux) waits on a condition variable, which can only be signaled when the RTL is being destroyed. The other worker threads simply do nothing. The main thread needs to wait there because, in the current implementation, once the main thread finishes the wrapped function of this team, it starts to free the team, which is not what we want. Two environment variables, `LIBOMP_NUM_HIDDEN_HELPER_THREADS` and `LIBOMP_USE_HIDDEN_HELPER_TASK`, are also provided to configure the number of threads and to enable/disable this feature. By default, the number of hidden helper threads is 8. Here are some open issues to be discussed: 1. The main thread goes to sleep once initialization is finished. As Andrey mentioned, we might need it to be woken from time to time to do some work. What kind of update/check should be put here? Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D77609
1 parent 357eea6 commit 9d64275

File tree

15 files changed

+1023
-46
lines changed

15 files changed

+1023
-46
lines changed

openmp/runtime/src/kmp.h

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2334,7 +2334,8 @@ typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
23342334
unsigned priority_specified : 1; /* set if the compiler provides priority
23352335
setting for the task */
23362336
unsigned detachable : 1; /* 1 == can detach */
2337-
unsigned reserved : 9; /* reserved for compiler use */
2337+
unsigned hidden_helper : 1; /* 1 == hidden helper task */
2338+
unsigned reserved : 8; /* reserved for compiler use */
23382339

23392340
/* Library flags */ /* Total library flags must be 16 bits */
23402341
unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
@@ -2382,6 +2383,13 @@ struct kmp_taskdata { /* aligned during dynamic allocation */
23822383
kmp_depnode_t
23832384
*td_depnode; // Pointer to graph node if this task has dependencies
23842385
kmp_task_team_t *td_task_team;
2386+
// The global thread id of the encountering thread. We need it because when a
2387+
// regular task depends on a hidden helper task, and the hidden helper task
2388+
// is finished on a hidden helper thread, it will call __kmp_release_deps to
2389+
// release all dependences. If now the task is a regular task, we need to pass
2390+
// the encountering gtid such that the task will be picked up and executed by
2391+
// its encountering team instead of hidden helper team.
2392+
kmp_int32 encountering_gtid;
23852393
size_t td_size_alloc; // Size of task structure, including shareds etc.
23862394
#if defined(KMP_GOMP_COMPAT)
23872395
// 4 or 8 byte integers for the loop bounds in GOMP_taskloop
@@ -2449,6 +2457,9 @@ typedef struct kmp_base_task_team {
24492457
kmp_int32 tt_max_threads; // # entries allocated for threads_data array
24502458
kmp_int32 tt_found_proxy_tasks; // found proxy tasks since last barrier
24512459
kmp_int32 tt_untied_task_encountered;
2460+
// A hidden helper task has been encountered in this task team, so we must
2461+
// wait when waiting on the task team
2462+
kmp_int32 tt_hidden_helper_task_encountered;
24522463

24532464
KMP_ALIGN_CACHE
24542465
std::atomic<kmp_int32> tt_unfinished_threads; /* #threads still active */
@@ -2917,6 +2928,7 @@ extern volatile int __kmp_init_parallel;
29172928
extern volatile int __kmp_init_monitor;
29182929
#endif
29192930
extern volatile int __kmp_init_user_locks;
2931+
extern volatile int __kmp_init_hidden_helper_threads;
29202932
extern int __kmp_init_counter;
29212933
extern int __kmp_root_counter;
29222934
extern int __kmp_version;
@@ -3985,6 +3997,45 @@ static inline void __kmp_resume_if_hard_paused() {
39853997

39863998
extern void __kmp_omp_display_env(int verbose);
39873999

4000+
// 1: it is initializing hidden helper team
4001+
extern volatile int __kmp_init_hidden_helper;
4002+
// 1: the hidden helper team is done
4003+
extern volatile int __kmp_hidden_helper_team_done;
4004+
// 1: enable hidden helper task
4005+
extern kmp_int32 __kmp_enable_hidden_helper;
4006+
// Main thread of hidden helper team
4007+
extern kmp_info_t *__kmp_hidden_helper_main_thread;
4008+
// Descriptors for the hidden helper threads
4009+
extern kmp_info_t **__kmp_hidden_helper_threads;
4010+
// Number of hidden helper threads
4011+
extern kmp_int32 __kmp_hidden_helper_threads_num;
4012+
// Number of hidden helper tasks that have not been executed yet
4013+
extern std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
4014+
4015+
extern void __kmp_hidden_helper_initialize();
4016+
extern void __kmp_hidden_helper_threads_initz_routine();
4017+
extern void __kmp_do_initialize_hidden_helper_threads();
4018+
extern void __kmp_hidden_helper_threads_initz_wait();
4019+
extern void __kmp_hidden_helper_initz_release();
4020+
extern void __kmp_hidden_helper_threads_deinitz_wait();
4021+
extern void __kmp_hidden_helper_threads_deinitz_release();
4022+
extern void __kmp_hidden_helper_main_thread_wait();
4023+
extern void __kmp_hidden_helper_worker_thread_wait();
4024+
extern void __kmp_hidden_helper_worker_thread_signal();
4025+
extern void __kmp_hidden_helper_main_thread_release();
4026+
4027+
// Check whether a given thread is a hidden helper thread
4028+
#define KMP_HIDDEN_HELPER_THREAD(gtid) \
4029+
((gtid) >= 1 && (gtid) <= __kmp_hidden_helper_threads_num)
4030+
4031+
#define KMP_HIDDEN_HELPER_WORKER_THREAD(gtid) \
4032+
((gtid) > 1 && (gtid) <= __kmp_hidden_helper_threads_num)
4033+
4034+
// Map a gtid to a hidden helper thread. The first hidden helper thread, a.k.a.
4035+
// the main thread, is skipped.
4036+
#define KMP_GTID_TO_SHADOW_GTID(gtid) \
4037+
((gtid) % (__kmp_hidden_helper_threads_num - 1) + 2)
4038+
39884039
#ifdef __cplusplus
39894040
}
39904041
#endif

openmp/runtime/src/kmp_global.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ volatile int __kmp_init_gtid = FALSE;
4646
volatile int __kmp_init_common = FALSE;
4747
volatile int __kmp_init_middle = FALSE;
4848
volatile int __kmp_init_parallel = FALSE;
49+
volatile int __kmp_init_hidden_helper = FALSE;
50+
volatile int __kmp_init_hidden_helper_threads = FALSE;
51+
volatile int __kmp_hidden_helper_team_done = FALSE;
4952
#if KMP_USE_MONITOR
5053
volatile int __kmp_init_monitor =
5154
0; /* 1 - launched, 2 - actually started (Windows* OS only) */

openmp/runtime/src/kmp_runtime.cpp

Lines changed: 145 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3644,15 +3644,37 @@ int __kmp_register_root(int initial_thread) {
36443644
}
36453645
}
36463646

3647-
/* find an available thread slot */
3648-
/* Don't reassign the zero slot since we need that to only be used by initial
3649-
thread */
3650-
for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3651-
gtid++)
3652-
;
3653-
KA_TRACE(1,
3654-
("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3655-
KMP_ASSERT(gtid < __kmp_threads_capacity);
3647+
// When hidden helper task is enabled, __kmp_threads is organized as follows:
3648+
// 0: initial thread, also a regular OpenMP thread.
3649+
// [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3650+
// [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3651+
// regular OpenMP threads.
3652+
if (TCR_4(__kmp_init_hidden_helper_threads)) {
3653+
// Find an available thread slot for hidden helper thread. Slots for hidden
3654+
// helper threads start from 1 to __kmp_hidden_helper_threads_num.
3655+
for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3656+
gtid <= __kmp_hidden_helper_threads_num;
3657+
gtid++)
3658+
;
3659+
KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3660+
KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3661+
"hidden helper thread: T#%d\n",
3662+
gtid));
3663+
} else {
3664+
/* find an available thread slot */
3665+
// Don't reassign the zero slot since we need that to only be used by
3666+
// initial thread. Slots for hidden helper threads should also be skipped.
3667+
if (initial_thread && __kmp_threads[0] == NULL) {
3668+
gtid = 0;
3669+
} else {
3670+
for (gtid = __kmp_hidden_helper_threads_num + 1;
3671+
TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3672+
;
3673+
}
3674+
KA_TRACE(
3675+
1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3676+
KMP_ASSERT(gtid < __kmp_threads_capacity);
3677+
}
36563678

36573679
/* update global accounting */
36583680
__kmp_all_nth++;
@@ -4303,8 +4325,20 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
43034325
#endif
43044326

43054327
KMP_MB();
4306-
for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4307-
KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4328+
4329+
{
4330+
int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4331+
? 1
4332+
: __kmp_hidden_helper_threads_num + 1;
4333+
4334+
for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4335+
++new_gtid) {
4336+
KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4337+
}
4338+
4339+
if (TCR_4(__kmp_init_hidden_helper_threads)) {
4340+
KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4341+
}
43084342
}
43094343

43104344
/* allocate space for it. */
@@ -6249,6 +6283,15 @@ void __kmp_internal_end_thread(int gtid_req) {
62496283
return;
62506284
}
62516285

6286+
// If hidden helper team has been initialized, we need to deinit it
6287+
if (TCR_4(__kmp_init_hidden_helper)) {
6288+
TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6289+
// First release the main thread to let it continue its work
6290+
__kmp_hidden_helper_main_thread_release();
6291+
// Wait until the hidden helper team has been destroyed
6292+
__kmp_hidden_helper_threads_deinitz_wait();
6293+
}
6294+
62526295
KMP_MB(); /* Flush all pending memory write invalidates. */
62536296

62546297
/* find out who we are and what we should do */
@@ -7125,6 +7168,41 @@ void __kmp_parallel_initialize(void) {
71257168
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
71267169
}
71277170

7171+
void __kmp_hidden_helper_initialize() {
7172+
if (TCR_4(__kmp_init_hidden_helper))
7173+
return;
7174+
7175+
// __kmp_parallel_initialize is required before we initialize hidden helper
7176+
if (!TCR_4(__kmp_init_parallel))
7177+
__kmp_parallel_initialize();
7178+
7179+
// Double check. Note that this double check must not be placed before
7180+
// __kmp_parallel_initialize, as that would cause a deadlock.
7181+
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7182+
if (TCR_4(__kmp_init_hidden_helper)) {
7183+
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7184+
return;
7185+
}
7186+
7187+
// Set the count of hidden helper tasks to be executed to zero
7188+
KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7189+
7190+
// Set the global variable indicating that we're initializing hidden helper
7191+
// team/threads
7192+
TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7193+
7194+
// Platform independent initialization
7195+
__kmp_do_initialize_hidden_helper_threads();
7196+
7197+
// Wait here until initialization of the hidden helper team has finished
7198+
__kmp_hidden_helper_threads_initz_wait();
7199+
7200+
// We have finished hidden helper initialization
7201+
TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7202+
7203+
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7204+
}
7205+
71287206
/* ------------------------------------------------------------------------ */
71297207

71307208
void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
@@ -8470,11 +8548,66 @@ int __kmp_pause_resource(kmp_pause_status_t level) {
84708548
}
84718549
}
84728550

8473-
84748551
void __kmp_omp_display_env(int verbose) {
84758552
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
84768553
if (__kmp_init_serial == 0)
84778554
__kmp_do_serial_initialize();
84788555
__kmp_display_env_impl(!verbose, verbose);
84798556
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
84808557
}
8558+
8559+
// Globals and functions for hidden helper task
8560+
kmp_info_t **__kmp_hidden_helper_threads;
8561+
kmp_info_t *__kmp_hidden_helper_main_thread;
8562+
kmp_int32 __kmp_hidden_helper_threads_num = 8;
8563+
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8564+
#if KMP_OS_LINUX
8565+
kmp_int32 __kmp_enable_hidden_helper = TRUE;
8566+
#else
8567+
kmp_int32 __kmp_enable_hidden_helper = FALSE;
8568+
#endif
8569+
8570+
namespace {
8571+
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8572+
8573+
void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8574+
// This is an explicit synchronization of all hidden helper threads. It covers
8575+
// the case where a regular thread pushes a hidden helper task to a hidden
8576+
// helper thread that has not been awakened even once since it was released
8577+
// by the main thread after the team was created.
8578+
KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8579+
while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8580+
__kmp_hidden_helper_threads_num)
8581+
;
8582+
8583+
// If main thread, then wait for signal
8584+
if (__kmpc_master(nullptr, *gtid)) {
8585+
// First, unset the initial state and release the initial thread
8586+
TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8587+
__kmp_hidden_helper_initz_release();
8588+
__kmp_hidden_helper_main_thread_wait();
8589+
// Now wake up all worker threads
8590+
for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8591+
__kmp_hidden_helper_worker_thread_signal();
8592+
}
8593+
}
8594+
}
8595+
} // namespace
8596+
8597+
void __kmp_hidden_helper_threads_initz_routine() {
8598+
// Create a new root for hidden helper team/threads
8599+
const int gtid = __kmp_register_root(TRUE);
8600+
__kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8601+
__kmp_hidden_helper_threads = &__kmp_threads[gtid];
8602+
__kmp_hidden_helper_main_thread->th.th_set_nproc =
8603+
__kmp_hidden_helper_threads_num;
8604+
8605+
KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8606+
8607+
__kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8608+
8609+
// Set the initialization flag to FALSE
8610+
TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8611+
8612+
__kmp_hidden_helper_threads_deinitz_release();
8613+
}

openmp/runtime/src/kmp_settings.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,11 @@ int __kmp_initial_threads_capacity(int req_nproc) {
503503
if (nth < (4 * __kmp_xproc))
504504
nth = (4 * __kmp_xproc);
505505

506+
// If hidden helper task is enabled, we initialize the thread capacity with
507+
// extra
508+
// __kmp_hidden_helper_threads_num.
509+
nth += __kmp_hidden_helper_threads_num;
510+
506511
if (nth > __kmp_max_nth)
507512
nth = __kmp_max_nth;
508513

@@ -1161,6 +1166,39 @@ static void __kmp_stg_parse_num_threads(char const *name, char const *value,
11611166
K_DIAG(1, ("__kmp_dflt_team_nth == %d\n", __kmp_dflt_team_nth));
11621167
} // __kmp_stg_parse_num_threads
11631168

1169+
static void __kmp_stg_parse_num_hidden_helper_threads(char const *name,
1170+
char const *value,
1171+
void *data) {
1172+
__kmp_stg_parse_int(name, value, 0, 16, &__kmp_hidden_helper_threads_num);
1173+
// If the number of hidden helper threads is zero, we disable hidden helper
1174+
// task
1175+
if (__kmp_hidden_helper_threads_num == 0) {
1176+
__kmp_enable_hidden_helper = FALSE;
1177+
}
1178+
} // __kmp_stg_parse_num_hidden_helper_threads
1179+
1180+
static void __kmp_stg_print_num_hidden_helper_threads(kmp_str_buf_t *buffer,
1181+
char const *name,
1182+
void *data) {
1183+
__kmp_stg_print_int(buffer, name, __kmp_hidden_helper_threads_num);
1184+
} // __kmp_stg_print_num_hidden_helper_threads
1185+
1186+
static void __kmp_stg_parse_use_hidden_helper(char const *name,
1187+
char const *value, void *data) {
1188+
__kmp_stg_parse_bool(name, value, &__kmp_enable_hidden_helper);
1189+
#if !KMP_OS_LINUX
1190+
__kmp_enable_hidden_helper = FALSE;
1191+
K_DIAG(1,
1192+
("__kmp_stg_parse_use_hidden_helper: Disable hidden helper task on "
1193+
"non-Linux platform although it is enabled by user explicitly.\n"));
1194+
#endif
1195+
} // __kmp_stg_parse_use_hidden_helper
1196+
1197+
static void __kmp_stg_print_use_hidden_helper(kmp_str_buf_t *buffer,
1198+
char const *name, void *data) {
1199+
__kmp_stg_print_bool(buffer, name, __kmp_enable_hidden_helper);
1200+
} // __kmp_stg_print_use_hidden_helper
1201+
11641202
static void __kmp_stg_print_num_threads(kmp_str_buf_t *buffer, char const *name,
11651203
void *data) {
11661204
if (__kmp_env_format) {
@@ -4954,6 +4992,11 @@ static kmp_setting_t __kmp_stg_table[] = {
49544992
__kmp_stg_print_omp_cancellation, NULL, 0, 0},
49554993
{"OMP_ALLOCATOR", __kmp_stg_parse_allocator, __kmp_stg_print_allocator,
49564994
NULL, 0, 0},
4995+
{"LIBOMP_USE_HIDDEN_HELPER_TASK", __kmp_stg_parse_use_hidden_helper,
4996+
__kmp_stg_print_use_hidden_helper, NULL, 0, 0},
4997+
{"LIBOMP_NUM_HIDDEN_HELPER_THREADS",
4998+
__kmp_stg_parse_num_hidden_helper_threads,
4999+
__kmp_stg_print_num_hidden_helper_threads, NULL, 0, 0},
49575000

49585001
#if OMPT_SUPPORT
49595002
{"OMP_TOOL", __kmp_stg_parse_omp_tool, __kmp_stg_print_omp_tool, NULL, 0,

openmp/runtime/src/kmp_taskdeps.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -588,7 +588,9 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
588588
current_task->td_flags.tasking_ser ||
589589
current_task->td_flags.final;
590590
kmp_task_team_t *task_team = thread->th.th_task_team;
591-
serial = serial && !(task_team && task_team->tt.tt_found_proxy_tasks);
591+
serial = serial &&
592+
!(task_team && (task_team->tt.tt_found_proxy_tasks ||
593+
task_team->tt.tt_hidden_helper_task_encountered));
592594

593595
if (!serial && (ndeps > 0 || ndeps_noalias > 0)) {
594596
/* if no dependencies have been tracked yet, create the dependence hash */

0 commit comments

Comments
 (0)