Commit 1a77ffc

bonzini authored and gregkh committed
KVM: x86: switch hugepage recovery thread to vhost_task
commit d96c77b upstream.

kvm_vm_create_worker_thread() is meant to be used for kthreads that can consume significant amounts of CPU time on behalf of a VM or in response to how the VM behaves (for example how it accesses its memory). Therefore it wants to charge the CPU time consumed by that work to the VM's container.

However, because of these threads, cgroups which have KVM instances inside never complete freezing. This can be trivially reproduced:

  root@test ~# mkdir /sys/fs/cgroup/test
  root@test ~# echo $$ > /sys/fs/cgroup/test/cgroup.procs
  root@test ~# qemu-system-x86_64 -nographic -enable-kvm

and in another terminal:

  root@test ~# echo 1 > /sys/fs/cgroup/test/cgroup.freeze
  root@test ~# cat /sys/fs/cgroup/test/cgroup.events
  populated 1
  frozen 0

The cgroup freezing happens in the signal delivery path, but kvm_nx_huge_page_recovery_worker, while joining non-root cgroups, never calls into the signal delivery path and thus never gets frozen. Because the cgroup freezer determines whether a given cgroup is frozen by comparing the number of frozen threads to the total number of threads in the cgroup, the cgroup never becomes frozen and users waiting for the state transition may hang indefinitely.

Since the worker kthread is tied to a user process, it's better if it behaves similarly to user tasks as much as possible, including being subject to SIGSTOP and SIGCONT. In fact, vhost_task is all that kvm_vm_create_worker_thread() wanted to be and more: not only does it inherit the userspace process's cgroups, it has other niceties like being parented properly in the process tree. Use it instead of the homegrown alternative.

Incidentally, the new code is also better behaved when you flip recovery back and forth to disabled and back to enabled. If your recovery period is 1 minute, it will run the next recovery after 1 minute, independent of how many times you flipped the parameter.

(Commit message based on emails from Tejun.)

Reported-by: Tejun Heo <[email protected]>
Reported-by: Luca Boccassi <[email protected]>
Acked-by: Tejun Heo <[email protected]>
Tested-by: Luca Boccassi <[email protected]>
Cc: [email protected]
Reviewed-by: Sean Christopherson <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent 49f7fd0 commit 1a77ffc
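
For context, here is a minimal sketch of the vhost_task lifecycle this patch adopts. It is based only on the calls visible in the diff below (vhost_task_create(), vhost_task_start(), vhost_task_wake(), vhost_task_stop()); the my_* names are illustrative, not kernel symbols:

#include <linux/sched/vhost_task.h>

/*
 * Worker callback, invoked in a loop by the vhost_task. Returning
 * true asks to be invoked again; returning false puts the task to
 * sleep until someone calls vhost_task_wake(). This is the contract
 * the new kvm_nx_huge_page_recovery_worker() below relies on.
 */
static bool my_worker(void *data)
{
        /* ... do one bounded chunk of work on behalf of the VM ... */
        return true;
}

/* Invoked if the owning process is killed; nothing to release here. */
static void my_worker_kill(void *data)
{
}

static struct vhost_task *my_task;

static int my_start(void *priv)
{
        /* The new task inherits the creating process's cgroups. */
        my_task = vhost_task_create(my_worker, my_worker_kill, priv,
                                    "my-worker");
        if (!my_task)
                return -ENOMEM;

        vhost_task_start(my_task);
        return 0;
}

static void my_stop(void)
{
        if (my_task)
                vhost_task_stop(my_task);       /* waits for the task to exit */
}

Because the resulting task is an ordinary thread of the owning process rather than a kthread, the cgroup freezer can freeze it through the normal signal delivery path, which is exactly what the kthread-based worker could not do.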

File tree

5 files changed: +35 -147 lines changed


arch/x86/include/asm/kvm_host.h

Lines changed: 3 additions & 1 deletion

@@ -26,6 +26,7 @@
 #include <linux/irqbypass.h>
 #include <linux/hyperv.h>
 #include <linux/kfifo.h>
+#include <linux/sched/vhost_task.h>

 #include <asm/apic.h>
 #include <asm/pvclock-abi.h>
@@ -1445,7 +1446,8 @@ struct kvm_arch {
        bool sgx_provisioning_allowed;

        struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
-       struct task_struct *nx_huge_page_recovery_thread;
+       struct vhost_task *nx_huge_page_recovery_thread;
+       u64 nx_huge_page_last;

 #ifdef CONFIG_X86_64
        /* The number of TDP MMU pages across all roots. */

arch/x86/kvm/Kconfig

Lines changed: 1 addition & 0 deletions

@@ -29,6 +29,7 @@ config KVM
        select HAVE_KVM_IRQ_BYPASS
        select HAVE_KVM_IRQ_ROUTING
        select HAVE_KVM_READONLY_MEM
+       select VHOST_TASK
        select KVM_ASYNC_PF
        select USER_RETURN_NOTIFIER
        select KVM_MMIO

arch/x86/kvm/mmu/mmu.c

Lines changed: 31 additions & 37 deletions

@@ -7160,7 +7160,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
                        kvm_mmu_zap_all_fast(kvm);
                        mutex_unlock(&kvm->slots_lock);

-                       wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+                       vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
                }
                mutex_unlock(&kvm_lock);
        }
@@ -7306,7 +7306,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
                mutex_lock(&kvm_lock);

                list_for_each_entry(kvm, &vm_list, vm_list)
-                       wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+                       vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);

                mutex_unlock(&kvm_lock);
        }
@@ -7409,62 +7409,56 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
        srcu_read_unlock(&kvm->srcu, rcu_idx);
 }

-static long get_nx_huge_page_recovery_timeout(u64 start_time)
+static void kvm_nx_huge_page_recovery_worker_kill(void *data)
 {
-       bool enabled;
-       uint period;
-
-       enabled = calc_nx_huge_pages_recovery_period(&period);
-
-       return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
-                      : MAX_SCHEDULE_TIMEOUT;
 }

-static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
+static bool kvm_nx_huge_page_recovery_worker(void *data)
 {
-       u64 start_time;
+       struct kvm *kvm = data;
+       bool enabled;
+       uint period;
        long remaining_time;

-       while (true) {
-               start_time = get_jiffies_64();
-               remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-
-               set_current_state(TASK_INTERRUPTIBLE);
-               while (!kthread_should_stop() && remaining_time > 0) {
-                       schedule_timeout(remaining_time);
-                       remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-                       set_current_state(TASK_INTERRUPTIBLE);
-               }
-
-               set_current_state(TASK_RUNNING);
-
-               if (kthread_should_stop())
-                       return 0;
+       enabled = calc_nx_huge_pages_recovery_period(&period);
+       if (!enabled)
+               return false;

-               kvm_recover_nx_huge_pages(kvm);
+       remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period)
+               - get_jiffies_64();
+       if (remaining_time > 0) {
+               schedule_timeout(remaining_time);
+               /* check for signals and come back */
+               return true;
        }
+
+       __set_current_state(TASK_RUNNING);
+       kvm_recover_nx_huge_pages(kvm);
+       kvm->arch.nx_huge_page_last = get_jiffies_64();
+       return true;
 }

 int kvm_mmu_post_init_vm(struct kvm *kvm)
 {
-       int err;
-
        if (nx_hugepage_mitigation_hard_disabled)
                return 0;

-       err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
-                                         "kvm-nx-lpage-recovery",
-                                         &kvm->arch.nx_huge_page_recovery_thread);
-       if (!err)
-               kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
+       kvm->arch.nx_huge_page_last = get_jiffies_64();
+       kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
+               kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
+               kvm, "kvm-nx-lpage-recovery");

-       return err;
+       if (!kvm->arch.nx_huge_page_recovery_thread)
+               return -ENOMEM;
+
+       vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
+       return 0;
 }

 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 {
        if (kvm->arch.nx_huge_page_recovery_thread)
-               kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
+               vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
 }

 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
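
Note that the rewritten worker is no longer a self-contained loop: the vhost_task infrastructure invokes it repeatedly, and each invocation either sleeps out the remainder of the recovery period or performs one recovery pass. Below is a simplified model of the driving loop, reconstructed from the behavior above; the real loop lives in kernel/vhost_task.c and additionally handles signal delivery (which is what makes the task freezable), and the stop flag shown here is an illustrative stand-in:

#include <linux/bitops.h>
#include <linux/sched.h>

/* Simplified model of how a vhost_task drives its worker callback. */
static int model_vhost_task_fn(bool (*fn)(void *), void *data,
                               unsigned long *stop_flag)
{
        for (;;) {
                /*
                 * The state is set to TASK_INTERRUPTIBLE before the worker
                 * runs; that is why kvm_nx_huge_page_recovery_worker() can
                 * call schedule_timeout() directly, and why it must switch
                 * back to TASK_RUNNING before doing a recovery pass.
                 */
                set_current_state(TASK_INTERRUPTIBLE);

                if (test_bit(0, stop_flag)) {
                        __set_current_state(TASK_RUNNING);
                        break;
                }

                if (!fn(data))
                        schedule();     /* idle: sleep until vhost_task_wake() */

                /* fn() returned true: call it again on the next iteration. */
        }
        return 0;
}

This structure also explains the better flip-the-parameter behavior noted in the commit message: kvm->arch.nx_huge_page_last advances only after an actual recovery pass, so disabling and re-enabling recovery neither shortens nor multiplies the one-period wait computed from it.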

include/linux/kvm_host.h

Lines changed: 0 additions & 6 deletions

@@ -2370,12 +2370,6 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */

-typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-                               uintptr_t data, const char *name,
-                               struct task_struct **thread_ptr);
-
 #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
 static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 {

virt/kvm/kvm_main.c

Lines changed: 0 additions & 103 deletions

@@ -6573,106 +6573,3 @@ void kvm_exit(void)
        kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
-
-struct kvm_vm_worker_thread_context {
-       struct kvm *kvm;
-       struct task_struct *parent;
-       struct completion init_done;
-       kvm_vm_thread_fn_t thread_fn;
-       uintptr_t data;
-       int err;
-};
-
-static int kvm_vm_worker_thread(void *context)
-{
-       /*
-        * The init_context is allocated on the stack of the parent thread, so
-        * we have to locally copy anything that is needed beyond initialization
-        */
-       struct kvm_vm_worker_thread_context *init_context = context;
-       struct task_struct *parent;
-       struct kvm *kvm = init_context->kvm;
-       kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
-       uintptr_t data = init_context->data;
-       int err;
-
-       err = kthread_park(current);
-       /* kthread_park(current) is never supposed to return an error */
-       WARN_ON(err != 0);
-       if (err)
-               goto init_complete;
-
-       err = cgroup_attach_task_all(init_context->parent, current);
-       if (err) {
-               kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
-                       __func__, err);
-               goto init_complete;
-       }
-
-       set_user_nice(current, task_nice(init_context->parent));
-
-init_complete:
-       init_context->err = err;
-       complete(&init_context->init_done);
-       init_context = NULL;
-
-       if (err)
-               goto out;
-
-       /* Wait to be woken up by the spawner before proceeding. */
-       kthread_parkme();
-
-       if (!kthread_should_stop())
-               err = thread_fn(kvm, data);
-
-out:
-       /*
-        * Move kthread back to its original cgroup to prevent it lingering in
-        * the cgroup of the VM process, after the latter finishes its
-        * execution.
-        *
-        * kthread_stop() waits on the 'exited' completion condition which is
-        * set in exit_mm(), via mm_release(), in do_exit(). However, the
-        * kthread is removed from the cgroup in the cgroup_exit() which is
-        * called after the exit_mm(). This causes the kthread_stop() to return
-        * before the kthread actually quits the cgroup.
-        */
-       rcu_read_lock();
-       parent = rcu_dereference(current->real_parent);
-       get_task_struct(parent);
-       rcu_read_unlock();
-       cgroup_attach_task_all(parent, current);
-       put_task_struct(parent);
-
-       return err;
-}
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-                               uintptr_t data, const char *name,
-                               struct task_struct **thread_ptr)
-{
-       struct kvm_vm_worker_thread_context init_context = {};
-       struct task_struct *thread;
-
-       *thread_ptr = NULL;
-       init_context.kvm = kvm;
-       init_context.parent = current;
-       init_context.thread_fn = thread_fn;
-       init_context.data = data;
-       init_completion(&init_context.init_done);
-
-       thread = kthread_run(kvm_vm_worker_thread, &init_context,
-                            "%s-%d", name, task_pid_nr(current));
-       if (IS_ERR(thread))
-               return PTR_ERR(thread);
-
-       /* kthread_run is never supposed to return NULL */
-       WARN_ON(thread == NULL);
-
-       wait_for_completion(&init_context.init_done);
-
-       if (!init_context.err)
-               *thread_ptr = thread;
-
-       return init_context.err;
-}
