Skip to content

Commit 8480ed9

Browse files
committed
xen/balloon: use a kernel thread instead of a workqueue
Today the Xen ballooning is done via delayed work in a workqueue. This might result in workqueue hangups being reported when large amounts of memory are being ballooned in one go (here 16GB): BUG: workqueue lockup - pool cpus=6 node=0 flags=0x0 nice=0 stuck for 64s! Showing busy workqueues and worker pools: workqueue events: flags=0x0 pwq 12: cpus=6 node=0 flags=0x0 nice=0 active=2/256 refcnt=3 in-flight: 229:balloon_process pending: cache_reap workqueue events_freezable_power_: flags=0x84 pwq 12: cpus=6 node=0 flags=0x0 nice=0 active=1/256 refcnt=2 pending: disk_events_workfn workqueue mm_percpu_wq: flags=0x8 pwq 12: cpus=6 node=0 flags=0x0 nice=0 active=1/256 refcnt=2 pending: vmstat_update pool 12: cpus=6 node=0 flags=0x0 nice=0 hung=64s workers=3 idle: 2222 43 This can easily be avoided by using a dedicated kernel thread for doing the ballooning work. Reported-by: Jan Beulich <[email protected]> Signed-off-by: Juergen Gross <[email protected]> Reviewed-by: Boris Ostrovsky <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Juergen Gross <[email protected]>
1 parent 58e6360 commit 8480ed9

File tree

1 file changed

+45
-17
lines changed

1 file changed

+45
-17
lines changed

drivers/xen/balloon.c

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
#include <linux/sched.h>
4444
#include <linux/cred.h>
4545
#include <linux/errno.h>
46+
#include <linux/freezer.h>
47+
#include <linux/kthread.h>
4648
#include <linux/mm.h>
4749
#include <linux/memblock.h>
4850
#include <linux/pagemap.h>
@@ -115,7 +117,7 @@ static struct ctl_table xen_root[] = {
115117
#define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1)
116118

117119
/*
118-
* balloon_process() state:
120+
* balloon_thread() state:
119121
*
120122
* BP_DONE: done or nothing to do,
121123
* BP_WAIT: wait to be rescheduled,
@@ -130,6 +132,8 @@ enum bp_state {
130132
BP_ECANCELED
131133
};
132134

135+
/* Main waiting point for xen-balloon thread. */
136+
static DECLARE_WAIT_QUEUE_HEAD(balloon_thread_wq);
133137

134138
static DEFINE_MUTEX(balloon_mutex);
135139

@@ -144,10 +148,6 @@ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)];
144148
static LIST_HEAD(ballooned_pages);
145149
static DECLARE_WAIT_QUEUE_HEAD(balloon_wq);
146150

147-
/* Main work function, always executed in process context. */
148-
static void balloon_process(struct work_struct *work);
149-
static DECLARE_DELAYED_WORK(balloon_worker, balloon_process);
150-
151151
/* When ballooning out (allocating memory to return to Xen) we don't really
152152
want the kernel to try too hard since that can trigger the oom killer. */
153153
#define GFP_BALLOON \
@@ -366,7 +366,7 @@ static void xen_online_page(struct page *page, unsigned int order)
366366
static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v)
367367
{
368368
if (val == MEM_ONLINE)
369-
schedule_delayed_work(&balloon_worker, 0);
369+
wake_up(&balloon_thread_wq);
370370

371371
return NOTIFY_OK;
372372
}
@@ -491,18 +491,43 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
491491
}
492492

493493
/*
494-
* As this is a work item it is guaranteed to run as a single instance only.
494+
* Stop waiting if either state is not BP_EAGAIN and ballooning action is
495+
* needed, or if the credit has changed while state is BP_EAGAIN.
496+
*/
497+
static bool balloon_thread_cond(enum bp_state state, long credit)
498+
{
499+
if (state != BP_EAGAIN)
500+
credit = 0;
501+
502+
return current_credit() != credit || kthread_should_stop();
503+
}
504+
505+
/*
506+
* As this is a kthread it is guaranteed to run as a single instance only.
495507
* We may of course race updates of the target counts (which are protected
496508
* by the balloon lock), or with changes to the Xen hard limit, but we will
497509
* recover from these in time.
498510
*/
499-
static void balloon_process(struct work_struct *work)
511+
static int balloon_thread(void *unused)
500512
{
501513
enum bp_state state = BP_DONE;
502514
long credit;
515+
unsigned long timeout;
516+
517+
set_freezable();
518+
for (;;) {
519+
if (state == BP_EAGAIN)
520+
timeout = balloon_stats.schedule_delay * HZ;
521+
else
522+
timeout = 3600 * HZ;
523+
credit = current_credit();
503524

525+
wait_event_interruptible_timeout(balloon_thread_wq,
526+
balloon_thread_cond(state, credit), timeout);
527+
528+
if (kthread_should_stop())
529+
return 0;
504530

505-
do {
506531
mutex_lock(&balloon_mutex);
507532

508533
credit = current_credit();
@@ -529,20 +554,15 @@ static void balloon_process(struct work_struct *work)
529554
mutex_unlock(&balloon_mutex);
530555

531556
cond_resched();
532-
533-
} while (credit && state == BP_DONE);
534-
535-
/* Schedule more work if there is some still to be done. */
536-
if (state == BP_EAGAIN)
537-
schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ);
557+
}
538558
}
539559

540560
/* Resets the Xen limit, sets new target, and kicks off processing. */
541561
void balloon_set_new_target(unsigned long target)
542562
{
543563
/* No need for lock. Not read-modify-write updates. */
544564
balloon_stats.target_pages = target;
545-
schedule_delayed_work(&balloon_worker, 0);
565+
wake_up(&balloon_thread_wq);
546566
}
547567
EXPORT_SYMBOL_GPL(balloon_set_new_target);
548568

@@ -647,7 +667,7 @@ void free_xenballooned_pages(int nr_pages, struct page **pages)
647667

648668
/* The balloon may be too large now. Shrink it if needed. */
649669
if (current_credit())
650-
schedule_delayed_work(&balloon_worker, 0);
670+
wake_up(&balloon_thread_wq);
651671

652672
mutex_unlock(&balloon_mutex);
653673
}
@@ -679,6 +699,8 @@ static void __init balloon_add_region(unsigned long start_pfn,
679699

680700
static int __init balloon_init(void)
681701
{
702+
struct task_struct *task;
703+
682704
if (!xen_domain())
683705
return -ENODEV;
684706

@@ -722,6 +744,12 @@ static int __init balloon_init(void)
722744
}
723745
#endif
724746

747+
task = kthread_run(balloon_thread, NULL, "xen-balloon");
748+
if (IS_ERR(task)) {
749+
pr_err("xen-balloon thread could not be started, ballooning will not work!\n");
750+
return PTR_ERR(task);
751+
}
752+
725753
/* Init the xen-balloon driver. */
726754
xen_balloon_init();
727755

0 commit comments

Comments
 (0)