
Commit 05715ab

KAGA-KOKO authored and jnettlet committed
workqueue: Prevent deadlock/stall on RT
Austin reported an XFS deadlock/stall on RT where scheduled work never gets executed and tasks are waiting for each other forever.

The underlying problem is the modification of the RT code to the handling of workers which are about to go to sleep. In mainline, a worker thread which goes to sleep wakes an idle worker if there is more work to do. This happens from the guts of the schedule() function. On RT this must happen outside of the scheduler, and the accessed data structures are not protected against scheduling due to the spinlock to rtmutex conversion.

So the naive solution to this was to move the code outside of the scheduler and protect the data structures by the pool lock. That approach turned out to be a little naive, as we cannot call into that code when the thread blocks on a lock: it is not allowed to block on two locks in parallel. So we don't call into the worker wakeup magic when the worker is blocked on a lock, which causes the deadlock/stall observed by Austin and Mike.

Looking deeper into that worker code, it turns out that the only relevant data structure which needs to be protected is the list of idle workers which can be woken up.

So the solution is to protect the list manipulation operations with preempt_enable/disable pairs on RT and to call unconditionally into the worker code, even when the worker is blocked on a lock. The preemption protection is safe as there is nothing which can fiddle with the list outside of thread context.

Reported-and-tested-by: Austin Schuh <[email protected]>
Reported-and-tested-by: Mike Galbraith <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: http://vger.kernel.org/r/alpine.DEB.2.10.1406271249510.5170@nanos
Cc: Richard Weinberger <[email protected]>
Cc: Steven Rostedt <[email protected]>
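As a quick orientation before the diff: on RT the protection boils down to a preempt-disabled region around every idle_list access, while the !RT wakeup path keeps taking pool->lock. A condensed sketch of the helpers and the access pattern added below (the full context is in the kernel/workqueue.c hunks):

#ifdef CONFIG_PREEMPT_RT_BASE
/* RT: idle_list is only touched from thread context, preempt off is enough */
static inline void rt_lock_idle_list(struct worker_pool *pool)      { preempt_disable(); }
static inline void rt_unlock_idle_list(struct worker_pool *pool)    { preempt_enable(); }
static inline void sched_lock_idle_list(struct worker_pool *pool)   { }
static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
#else
/* !RT: keep the pool->lock protection for the sleeping/wakeup path */
static inline void rt_lock_idle_list(struct worker_pool *pool)      { }
static inline void rt_unlock_idle_list(struct worker_pool *pool)    { }
static inline void sched_lock_idle_list(struct worker_pool *pool)   { spin_lock_irq(&pool->lock); }
static inline void sched_unlock_idle_list(struct worker_pool *pool) { spin_unlock_irq(&pool->lock); }
#endif

	/* every idle_list manipulation/traversal is bracketed like this: */
	rt_lock_idle_list(pool);
	list_add(&worker->entry, &pool->idle_list);
	rt_unlock_idle_list(pool);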
1 parent 3c6ebe7 commit 05715ab

2 files changed: 52 additions, 15 deletions


kernel/sched/core.c

Lines changed: 5 additions & 2 deletions
@@ -3676,16 +3676,19 @@ void __noreturn do_task_dead(void)
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-	if (!tsk->state || tsk_is_pi_blocked(tsk))
+	if (!tsk->state)
 		return;
-
 	/*
 	 * If a worker went to sleep, notify and ask workqueue whether
 	 * it wants to wake up a task to maintain concurrency.
 	 */
 	if (tsk->flags & PF_WQ_WORKER)
 		wq_worker_sleeping(tsk);
 
+
+	if (tsk_is_pi_blocked(tsk))
+		return;
+
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
 	 * make sure to submit it to avoid deadlocks.
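The net result of this hunk, assembled for readability (the trailing plugged-I/O handling is elided because it is unchanged): wq_worker_sleeping() is now called unconditionally for workqueue workers, and only the I/O flushing below it is still skipped when the task is PI-blocked on a lock.

static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state)
		return;
	/*
	 * If a worker went to sleep, notify and ask workqueue whether
	 * it wants to wake up a task to maintain concurrency.
	 */
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_sleeping(tsk);

	if (tsk_is_pi_blocked(tsk))
		return;

	/* ... unchanged plugged-I/O flushing elided ... */
}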

kernel/workqueue.c

Lines changed: 47 additions & 13 deletions
@@ -125,6 +125,11 @@ enum {
  * cpu or grabbing pool->lock is enough for read access. If
  * POOL_DISASSOCIATED is set, it's identical to L.
  *
+ * On RT we need the extra protection via rt_lock_idle_list() for
+ * the list manipulations against read access from
+ * wq_worker_sleeping(). All other places are nicely serialized via
+ * pool->lock.
+ *
  * A: pool->attach_mutex protected.
  *
  * PL: wq_pool_mutex protected.
@@ -430,6 +435,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
 	if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\
 	else
 
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void rt_lock_idle_list(struct worker_pool *pool)
+{
+	preempt_disable();
+}
+static inline void rt_unlock_idle_list(struct worker_pool *pool)
+{
+	preempt_enable();
+}
+static inline void sched_lock_idle_list(struct worker_pool *pool) { }
+static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
+#else
+static inline void rt_lock_idle_list(struct worker_pool *pool) { }
+static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
+static inline void sched_lock_idle_list(struct worker_pool *pool)
+{
+	spin_lock_irq(&pool->lock);
+}
+static inline void sched_unlock_idle_list(struct worker_pool *pool)
+{
+	spin_unlock_irq(&pool->lock);
+}
+#endif
+
+
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
 static struct debug_obj_descr work_debug_descr;
@@ -836,10 +866,16 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
  */
 static void wake_up_worker(struct worker_pool *pool)
 {
-	struct worker *worker = first_idle_worker(pool);
+	struct worker *worker;
+
+	rt_lock_idle_list(pool);
+
+	worker = first_idle_worker(pool);
 
 	if (likely(worker))
 		wake_up_process(worker->task);
+
+	rt_unlock_idle_list(pool);
 }
 
 /**
@@ -868,7 +904,7 @@ void wq_worker_running(struct task_struct *task)
  */
 void wq_worker_sleeping(struct task_struct *task)
 {
-	struct worker *next, *worker = kthread_data(task);
+	struct worker *worker = kthread_data(task);
 	struct worker_pool *pool;
 
 	/*
@@ -885,26 +921,18 @@ void wq_worker_sleeping(struct task_struct *task)
 		return;
 
 	worker->sleeping = 1;
-	spin_lock_irq(&pool->lock);
 
 	/*
 	 * The counterpart of the following dec_and_test, implied mb,
 	 * worklist not empty test sequence is in insert_work().
 	 * Please read comment there.
-	 *
-	 * NOT_RUNNING is clear. This means that we're bound to and
-	 * running on the local cpu w/ rq lock held and preemption
-	 * disabled, which in turn means that none else could be
-	 * manipulating idle_list, so dereferencing idle_list without pool
-	 * lock is safe.
	 */
 	if (atomic_dec_and_test(&pool->nr_running) &&
 	    !list_empty(&pool->worklist)) {
-		next = first_idle_worker(pool);
-		if (next)
-			wake_up_process(next->task);
+		sched_lock_idle_list(pool);
+		wake_up_worker(pool);
+		sched_unlock_idle_list(pool);
 	}
-	spin_unlock_irq(&pool->lock);
 }
 
 /**
@@ -1634,7 +1662,9 @@ static void worker_enter_idle(struct worker *worker)
 	worker->last_active = jiffies;
 
 	/* idle_list is LIFO */
+	rt_lock_idle_list(pool);
 	list_add(&worker->entry, &pool->idle_list);
+	rt_unlock_idle_list(pool);
 
 	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
 		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
@@ -1667,7 +1697,9 @@ static void worker_leave_idle(struct worker *worker)
 		return;
 	worker_clr_flags(worker, WORKER_IDLE);
 	pool->nr_idle--;
+	rt_lock_idle_list(pool);
 	list_del_init(&worker->entry);
+	rt_unlock_idle_list(pool);
 }
 
 static struct worker *alloc_worker(int node)
@@ -1833,7 +1865,9 @@ static void destroy_worker(struct worker *worker)
 	pool->nr_workers--;
 	pool->nr_idle--;
 
+	rt_lock_idle_list(pool);
 	list_del_init(&worker->entry);
+	rt_unlock_idle_list(pool);
 	worker->flags |= WORKER_DIE;
 	wake_up_process(worker->task);
 }
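For reference, the sleeping path as it looks after the patch, assembled from the hunks above (unchanged early checks elided): the pool->lock acquisition is gone from this function on RT; sched_lock_idle_list() still takes it on !RT, and wake_up_worker() protects the idle_list itself via rt_lock_idle_list().

void wq_worker_sleeping(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);
	struct worker_pool *pool;

	/* ... unchanged early checks elided ... */

	worker->sleeping = 1;

	/*
	 * The counterpart of the following dec_and_test, implied mb,
	 * worklist not empty test sequence is in insert_work().
	 * Please read comment there.
	 */
	if (atomic_dec_and_test(&pool->nr_running) &&
	    !list_empty(&pool->worklist)) {
		sched_lock_idle_list(pool);	/* pool->lock on !RT, no-op on RT */
		wake_up_worker(pool);		/* brackets idle_list access with rt_lock_idle_list() */
		sched_unlock_idle_list(pool);
	}
}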
