Skip to content

Commit df5aa2b

Browse files
lateralusXUnityAlex
authored andcommitted
Fix race condition in WaitAnyWithSecondMutexAbandoned test.
There is a small race condition during the completion of a native thread join call. During that period of time the tid is no longer included on the internal list tracking joinable threads so a thread that will join on the tid while another thread (like the finalizer thread) is waiting on native join to complete for the same tid, will cause the managed Thread.Join call to complete before the native join call has completed. This race could cause issues on some OS:es that clear's up some thread resources, like abandoned mutexes when the thread has exited. This race is hit by WaitAnyWithSecondMutexAbandoned since the call to Thread.Join will return before the thread owning the mutex has terminated meaning that it doesn't get ownership of the abandoned mutex as assumed by the test. Fix makes sure Thread.Join won't complete until native thread join is complete. Increasing the join timeouts in the test also eliminates a timeout error making it harder to hit the problematic code path, primarily during reproduction in the debugger. Fix also switch to coop mutex for joinable threads.
1 parent 6e61271 commit df5aa2b

File tree

2 files changed

+84
-19
lines changed

2 files changed

+84
-19
lines changed

mcs/class/corlib/Test/System.Threading/WaitHandleTest.cs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ public void WaitOneWithAbandonedMutex ()
409409
m.WaitOne ();
410410
});
411411
thread1.Start ();
412-
thread1.Join (1000);
412+
Assert.IsTrue (thread1.Join (Timeout.Infinite), "thread1.Join");
413413
try {
414414
m.WaitOne ();
415415
Assert.Fail ("Expected AbandonedMutexException");
@@ -421,7 +421,7 @@ public void WaitOneWithAbandonedMutex ()
421421
signalled = m.WaitOne (100);
422422
});
423423
thread2.Start ();
424-
thread2.Join (1000);
424+
Assert.IsTrue (thread2.Join (Timeout.Infinite), "thread2.Join");
425425
Assert.IsFalse (signalled);
426426

427427
// Since this thread owns the Mutex releasing it shouldn't fail
@@ -489,7 +489,7 @@ public void WaitAnyWithSecondMutexAbandoned ()
489489
m1.ReleaseMutex ();
490490
});
491491
thread1.Start ();
492-
thread1.Join (1000);
492+
Assert.IsTrue (thread1.Join (Timeout.Infinite), "thread1.Join");
493493
thread2.Start ();
494494
while (!mainProceed) {
495495
Thread.Sleep (10);
@@ -502,7 +502,7 @@ public void WaitAnyWithSecondMutexAbandoned ()
502502
Assert.AreEqual (m2, e.Mutex);
503503
} finally {
504504
thread2Proceed = true;
505-
thread2.Join (1000);
505+
Assert.IsTrue (thread2.Join (Timeout.Infinite), "thread2.Join");
506506
}
507507

508508
// Current thread should own the second Mutex now
@@ -511,7 +511,7 @@ public void WaitAnyWithSecondMutexAbandoned ()
511511
signalled = WaitHandle.WaitAny (new WaitHandle [] { m1, m2 }, 0);
512512
});
513513
thread3.Start ();
514-
thread3.Join (1000);
514+
Assert.IsTrue (thread3.Join (Timeout.Infinite), "thread3.Join");
515515
Assert.AreEqual (0, signalled);
516516

517517
// Since this thread owns the second Mutex releasing it shouldn't fail
@@ -540,7 +540,7 @@ public void WaitAllWithOneAbandonedMutex ()
540540
m1.WaitOne ();
541541
});
542542
thread.Start ();
543-
thread.Join (1000);
543+
Assert.IsTrue (thread.Join (Timeout.Infinite), "thread.Join");
544544
WaitHandle.WaitAll (new WaitHandle [] { m1, m2 });
545545
}
546546
}

mono/metadata/threads.c

Lines changed: 78 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,9 @@ static void mono_threads_unlock (void);
128128
static MonoCoopMutex threads_mutex;
129129

130130
/* Controls access to the 'joinable_threads' hash table */
131-
#define joinable_threads_lock() mono_os_mutex_lock (&joinable_threads_mutex)
132-
#define joinable_threads_unlock() mono_os_mutex_unlock (&joinable_threads_mutex)
133-
static mono_mutex_t joinable_threads_mutex;
131+
#define joinable_threads_lock() mono_coop_mutex_lock (&joinable_threads_mutex)
132+
#define joinable_threads_unlock() mono_coop_mutex_unlock (&joinable_threads_mutex)
133+
static MonoCoopMutex joinable_threads_mutex;
134134

135135
/* Holds current status of static data heap */
136136
static StaticDataInfo thread_static_info;
@@ -161,10 +161,19 @@ static MonoGHashTable *threads_starting_up = NULL;
161161
static GHashTable *joinable_threads;
162162
static gint32 joinable_thread_count;
163163

164+
/* mono_threads_join_threads will take threads from joinable_threads list and wait for them. */
165+
/* When this happens, the tid is not on the list anymore so mono_thread_join assumes the thread has complete */
166+
/* and will return back to the caller. This could cause a race since caller of join assumes thread has completed */
167+
/* and on some OS it could cause errors. Keeping the tid's currently pending a native thread join call */
168+
/* in a separate table (only affecting callers interested in this internal join detail) and look at that table in mono_thread_join */
169+
/* will close this race. */
170+
static GHashTable *pending_native_thread_join_calls;
171+
static MonoCoopCond pending_native_thread_join_calls_event;
172+
164173
static GHashTable *pending_joinable_threads;
165174
static gint32 pending_joinable_thread_count;
166175

167-
static mono_cond_t zero_pending_joinable_thread_event;
176+
static MonoCoopCond zero_pending_joinable_thread_event;
168177

169178
static void threads_add_pending_joinable_runtime_thread (MonoThreadInfo *mono_thread_info);
170179
static gboolean threads_wait_pending_joinable_threads (uint32_t timeout);
@@ -3036,11 +3045,12 @@ void mono_thread_init (MonoThreadStartCB start_cb,
30363045
mono_coop_mutex_init_recursive (&threads_mutex);
30373046

30383047
mono_os_mutex_init_recursive(&interlocked_mutex);
3039-
mono_os_mutex_init_recursive(&joinable_threads_mutex);
3048+
mono_coop_mutex_init_recursive(&joinable_threads_mutex);
30403049

30413050
mono_os_event_init (&background_change_event, FALSE);
30423051

3043-
mono_os_cond_init (&zero_pending_joinable_thread_event);
3052+
mono_coop_cond_init (&pending_native_thread_join_calls_event);
3053+
mono_coop_cond_init (&zero_pending_joinable_thread_event);
30443054

30453055
mono_init_static_data_info (&thread_static_info);
30463056
mono_init_static_data_info (&context_static_info);
@@ -3161,7 +3171,8 @@ mono_thread_cleanup (void)
31613171
mono_os_mutex_destroy (&interlocked_mutex);
31623172
mono_os_mutex_destroy (&delayed_free_table_mutex);
31633173
mono_os_mutex_destroy (&small_id_mutex);
3164-
mono_os_cond_destroy (&zero_pending_joinable_runtime_thread_event);
3174+
mono_coop_cond_destroy (&zero_pending_joinable_thread_event);
3175+
mono_coop_cond_destroy (&pending_native_thread_join_calls_event);
31653176
mono_os_event_destroy (&background_change_event);
31663177
#endif
31673178
}
@@ -5280,7 +5291,7 @@ threads_remove_pending_joinable_thread_nolock (gpointer tid)
52805291
if (pending_joinable_threads && g_hash_table_lookup_extended (pending_joinable_threads, tid, &orig_key, &value)) {
52815292
g_hash_table_remove (pending_joinable_threads, tid);
52825293
if (UnlockedDecrement (&pending_joinable_thread_count) == 0)
5283-
mono_os_cond_broadcast (&zero_pending_joinable_thread_event);
5294+
mono_coop_cond_broadcast (&zero_pending_joinable_thread_event);
52845295
}
52855296
}
52865297

@@ -5291,12 +5302,12 @@ threads_wait_pending_joinable_threads (uint32_t timeout)
52915302
joinable_threads_lock ();
52925303
if (timeout == MONO_INFINITE_WAIT) {
52935304
while (UnlockedRead (&pending_joinable_thread_count) > 0)
5294-
mono_os_cond_wait (&zero_pending_joinable_thread_event, &joinable_threads_mutex);
5305+
mono_coop_cond_wait (&zero_pending_joinable_thread_event, &joinable_threads_mutex);
52955306
} else {
52965307
gint64 start = mono_msec_ticks ();
52975308
gint64 elapsed = 0;
52985309
while (UnlockedRead (&pending_joinable_thread_count) > 0 && elapsed < timeout) {
5299-
mono_os_cond_timedwait (&zero_pending_joinable_thread_event, &joinable_threads_mutex, timeout - (uint32_t)elapsed);
5310+
mono_coop_cond_timedwait (&zero_pending_joinable_thread_event, &joinable_threads_mutex, timeout - (uint32_t)elapsed);
53005311
elapsed = mono_msec_ticks () - start;
53015312
}
53025313
}
@@ -5344,6 +5355,39 @@ mono_threads_add_joinable_runtime_thread (gpointer thread_info)
53445355
}
53455356
}
53465357

5358+
static void
5359+
threads_add_pending_native_thread_join_call_nolock (gpointer tid)
5360+
{
5361+
if (!pending_native_thread_join_calls)
5362+
pending_native_thread_join_calls = g_hash_table_new (NULL, NULL);
5363+
5364+
gpointer orig_key;
5365+
gpointer value;
5366+
5367+
if (!g_hash_table_lookup_extended (pending_native_thread_join_calls, tid, &orig_key, &value))
5368+
g_hash_table_insert (pending_native_thread_join_calls, tid, tid);
5369+
}
5370+
5371+
static void
5372+
threads_remove_pending_native_thread_join_call_nolock (gpointer tid)
5373+
{
5374+
if (pending_native_thread_join_calls)
5375+
g_hash_table_remove (pending_native_thread_join_calls, tid);
5376+
5377+
mono_coop_cond_broadcast (&pending_native_thread_join_calls_event);
5378+
}
5379+
5380+
static void
5381+
threads_wait_pending_native_thread_join_call_nolock (gpointer tid)
5382+
{
5383+
gpointer orig_key;
5384+
gpointer value;
5385+
5386+
while (g_hash_table_lookup_extended (pending_native_thread_join_calls, tid, &orig_key, &value)) {
5387+
mono_coop_cond_wait (&pending_native_thread_join_calls_event, &joinable_threads_mutex);
5388+
}
5389+
}
5390+
53475391
/*
53485392
* mono_add_joinable_thread:
53495393
*
@@ -5375,23 +5419,30 @@ void
53755419
mono_threads_join_threads (void)
53765420
{
53775421
GHashTableIter iter;
5378-
gpointer key;
5379-
gpointer value;
5380-
gboolean found;
5422+
gpointer key = NULL;
5423+
gpointer value = NULL;
5424+
gboolean found = FALSE;
53815425

53825426
/* Fastpath */
53835427
if (!UnlockedRead (&joinable_thread_count))
53845428
return;
53855429

53865430
while (TRUE) {
53875431
joinable_threads_lock ();
5432+
if (found) {
5433+
// Previous native thread join call completed.
5434+
threads_remove_pending_native_thread_join_call_nolock (key);
5435+
}
53885436
found = FALSE;
53895437
if (g_hash_table_size (joinable_threads)) {
53905438
g_hash_table_iter_init (&iter, joinable_threads);
53915439
g_hash_table_iter_next (&iter, &key, (void**)&value);
53925440
g_hash_table_remove (joinable_threads, key);
53935441
UnlockedDecrement (&joinable_thread_count);
53945442
found = TRUE;
5443+
5444+
// Add to table of tid's with pending native thread join call.
5445+
threads_add_pending_native_thread_join_call_nolock (key);
53955446
}
53965447
joinable_threads_unlock ();
53975448
if (found)
@@ -5422,13 +5473,27 @@ mono_thread_join (gpointer tid)
54225473
g_hash_table_remove (joinable_threads, tid);
54235474
UnlockedDecrement (&joinable_thread_count);
54245475
found = TRUE;
5476+
5477+
// Add to table of tid's with pending native join call.
5478+
threads_add_pending_native_thread_join_call_nolock (tid);
5479+
}
5480+
5481+
if (!found) {
5482+
// Wait for any pending native thread join call not yet completed for this tid.
5483+
threads_wait_pending_native_thread_join_call_nolock (tid);
54255484
}
5485+
54265486
joinable_threads_unlock ();
54275487

54285488
if (!found)
54295489
return;
54305490

54315491
threads_native_thread_join_nolock (tid, value);
5492+
5493+
joinable_threads_lock ();
5494+
// Native thread join call completed for this tid.
5495+
threads_remove_pending_native_thread_join_call_nolock (tid);
5496+
joinable_threads_unlock ();
54325497
}
54335498

54345499
void

0 commit comments

Comments
 (0)