
Commit 7d9896e

lulu-github-name authored and mstsirkin committed
vhost: Reintroduce kthread API and add mode selection
Since commit 6e890c5 ("vhost: use vhost_tasks for worker threads"), vhost has used vhost_task and operated as a child of the owner thread. This is required for correct CPU usage accounting, especially when using containers.

However, this change caused confusion for some legacy userspace applications, and we didn't notice until it was too late. Unfortunately, it is too late to revert - we now have userspace depending on both the old and the new behaviour :(

To address the issue, reintroduce kthread mode for vhost workers and provide a configuration to select between the kthread and task worker:

- Add a 'fork_owner' parameter to vhost_dev to let users select kthread or task mode. The default is task mode (VHOST_FORK_OWNER_TASK).

- Reintroduce kthread mode support:
  * Bring back the original vhost_worker() implementation, renamed to vhost_run_work_kthread_list().
  * Add cgroup support for the kthread.
  * Introduce struct vhost_worker_ops:
    - Encapsulates the create / stop / wake-up callbacks.
    - vhost_worker_create() selects the proper ops according to fork_owner.

- Userspace configuration interface:
  * New IOCTLs:
    - VHOST_SET_FORK_FROM_OWNER lets userspace select task mode (VHOST_FORK_OWNER_TASK) or kthread mode (VHOST_FORK_OWNER_KTHREAD).
    - VHOST_GET_FORK_FROM_OWNER reads the current worker mode.
  * Expose the module parameter 'fork_from_owner_default' to allow system administrators to configure the default mode for vhost workers.
  * The Kconfig option CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL controls whether these IOCTLs and the parameter are available.

- VHOST_NEW_WORKER requires fork_owner to be set to true; validation is added to ensure proper configuration.

This partially reverts or improves upon:
  commit 6e890c5 ("vhost: use vhost_tasks for worker threads")
  commit 1cdaafa ("vhost: replace single worker pointer with xarray")

Fixes: 6e890c5 ("vhost: use vhost_tasks for worker threads")
Signed-off-by: Cindy Lu <[email protected]>
Message-Id: <[email protected]>
Signed-off-by: Michael S. Tsirkin <[email protected]>
Acked-by: Jason Wang <[email protected]>
Tested-by: Lei Yang <[email protected]>
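
For illustration only (this sketch is not part of the commit): a minimal userspace use of the new interface. It assumes the updated uapi <linux/vhost.h> exports the new ioctls and the VHOST_FORK_OWNER_TASK / VHOST_FORK_OWNER_KTHREAD constants, and it uses /dev/vhost-net purely as an example device node. Per the vhost.c hunk below, the mode can only be changed before an owner is set.

/*
 * Hypothetical usage sketch, not taken from the commit. Assumes the updated
 * uapi header exports the new ioctls and constants; /dev/vhost-net is only
 * an example device.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/vhost.h>

int main(void)
{
	__u8 mode = VHOST_FORK_OWNER_KTHREAD;	/* request legacy kthread workers */
	int fd = open("/dev/vhost-net", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Must happen before VHOST_SET_OWNER; afterwards the kernel returns EBUSY. */
	if (ioctl(fd, VHOST_SET_FORK_FROM_OWNER, &mode) < 0)
		perror("VHOST_SET_FORK_FROM_OWNER");

	/* Read the current mode back as a single byte. */
	if (ioctl(fd, VHOST_GET_FORK_FROM_OWNER, &mode) < 0)
		perror("VHOST_GET_FORK_FROM_OWNER");
	else
		printf("worker mode: %u\n", mode);

	return 0;
}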
1 parent d9ea58b commit 7d9896e

File tree

4 files changed: 295 additions, 18 deletions


drivers/vhost/Kconfig

Lines changed: 18 additions & 0 deletions
@@ -95,4 +95,22 @@ config VHOST_CROSS_ENDIAN_LEGACY
 
 	  If unsure, say "N".
 
+config VHOST_ENABLE_FORK_OWNER_CONTROL
+	bool "Enable VHOST_ENABLE_FORK_OWNER_CONTROL"
+	default y
+	help
+	  This option enables two IOCTLs: VHOST_SET_FORK_FROM_OWNER and
+	  VHOST_GET_FORK_FROM_OWNER. These allow userspace applications
+	  to modify the vhost worker mode for vhost devices.
+
+	  Also expose module parameter 'fork_from_owner_default' to allow users
+	  to configure the default mode for vhost workers.
+
+	  By default, `VHOST_ENABLE_FORK_OWNER_CONTROL` is set to `y`,
+	  users can change the worker thread mode as needed.
+	  If this config is disabled (n), the related IOCTLs and parameters will
+	  be unavailable.
+
+	  If unsure, say "Y".
+
 endif
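
A hedged aside, not part of the diff: the 'fork_from_owner_default' parameter mentioned above is registered read-only (mode 0444) in the vhost.c hunk below, so the default can only be chosen at load time (as a module option, or via vhost.fork_from_owner_default= on the kernel command line when built in) and should appear under the conventional /sys/module/vhost/parameters/ path. A minimal sketch for checking it from userspace, assuming that sysfs path:

/*
 * Sketch only: reads the boolean module parameter exposed read-only (0444)
 * by the vhost.c change below. The /sys/module/<module>/parameters/<name>
 * path is the usual convention and is an assumption, not shown in this commit.
 */
#include <stdio.h>

int main(void)
{
	char buf[8] = "";
	FILE *f = fopen("/sys/module/vhost/parameters/fork_from_owner_default", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("default mode: %s", buf);	/* "Y" = task worker, "N" = kthread */
	fclose(f);
	return 0;
}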

drivers/vhost/vhost.c

Lines changed: 226 additions & 18 deletions
@@ -22,6 +22,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/kthread.h>
+#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/sort.h>
 #include <linux/sched/mm.h>
@@ -41,6 +42,13 @@ static int max_iotlb_entries = 2048;
 module_param(max_iotlb_entries, int, 0444);
 MODULE_PARM_DESC(max_iotlb_entries,
 	"Maximum number of iotlb entries. (default: 2048)");
+static bool fork_from_owner_default = VHOST_FORK_OWNER_TASK;
+
+#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
+module_param(fork_from_owner_default, bool, 0444);
+MODULE_PARM_DESC(fork_from_owner_default,
+		 "Set task mode as the default(default: Y)");
+#endif
 
 enum {
 	VHOST_MEMORY_F_LOG = 0x1,
@@ -242,7 +250,7 @@ static void vhost_worker_queue(struct vhost_worker *worker,
 		 * test_and_set_bit() implies a memory barrier.
 		 */
 		llist_add(&work->node, &worker->work_list);
-		vhost_task_wake(worker->vtsk);
+		worker->ops->wakeup(worker);
 	}
 }
 
@@ -388,6 +396,44 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	__vhost_vq_meta_reset(vq);
 }
 
+static int vhost_run_work_kthread_list(void *data)
+{
+	struct vhost_worker *worker = data;
+	struct vhost_work *work, *work_next;
+	struct vhost_dev *dev = worker->dev;
+	struct llist_node *node;
+
+	kthread_use_mm(dev->mm);
+
+	for (;;) {
+		/* mb paired w/ kthread_stop */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			break;
+		}
+		node = llist_del_all(&worker->work_list);
+		if (!node)
+			schedule();
+
+		node = llist_reverse_order(node);
+		/* make sure flag is seen after deletion */
+		smp_wmb();
+		llist_for_each_entry_safe(work, work_next, node, node) {
+			clear_bit(VHOST_WORK_QUEUED, &work->flags);
+			__set_current_state(TASK_RUNNING);
+			kcov_remote_start_common(worker->kcov_handle);
+			work->fn(work);
+			kcov_remote_stop();
+			cond_resched();
+		}
+	}
+	kthread_unuse_mm(dev->mm);
+
+	return 0;
+}
+
 static bool vhost_run_work_list(void *data)
 {
 	struct vhost_worker *worker = data;
@@ -552,6 +598,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->byte_weight = byte_weight;
 	dev->use_worker = use_worker;
 	dev->msg_handler = msg_handler;
+	dev->fork_owner = fork_from_owner_default;
 	init_waitqueue_head(&dev->wait);
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
@@ -581,6 +628,46 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
 
+struct vhost_attach_cgroups_struct {
+	struct vhost_work work;
+	struct task_struct *owner;
+	int ret;
+};
+
+static void vhost_attach_cgroups_work(struct vhost_work *work)
+{
+	struct vhost_attach_cgroups_struct *s;
+
+	s = container_of(work, struct vhost_attach_cgroups_struct, work);
+	s->ret = cgroup_attach_task_all(s->owner, current);
+}
+
+static int vhost_attach_task_to_cgroups(struct vhost_worker *worker)
+{
+	struct vhost_attach_cgroups_struct attach;
+	int saved_cnt;
+
+	attach.owner = current;
+
+	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
+	vhost_worker_queue(worker, &attach.work);
+
+	mutex_lock(&worker->mutex);
+
+	/*
+	 * Bypass attachment_cnt check in __vhost_worker_flush:
+	 * Temporarily change it to INT_MAX to bypass the check
+	 */
+	saved_cnt = worker->attachment_cnt;
+	worker->attachment_cnt = INT_MAX;
+	__vhost_worker_flush(worker);
+	worker->attachment_cnt = saved_cnt;
+
+	mutex_unlock(&worker->mutex);
+
+	return attach.ret;
+}
+
 /* Caller should have device mutex */
 bool vhost_dev_has_owner(struct vhost_dev *dev)
 {
@@ -626,7 +713,7 @@ static void vhost_worker_destroy(struct vhost_dev *dev,
 
 	WARN_ON(!llist_empty(&worker->work_list));
 	xa_erase(&dev->worker_xa, worker->id);
-	vhost_task_stop(worker->vtsk);
+	worker->ops->stop(worker);
 	kfree(worker);
 }
 
@@ -649,42 +736,115 @@ static void vhost_workers_free(struct vhost_dev *dev)
 	xa_destroy(&dev->worker_xa);
 }
 
+static void vhost_task_wakeup(struct vhost_worker *worker)
+{
+	return vhost_task_wake(worker->vtsk);
+}
+
+static void vhost_kthread_wakeup(struct vhost_worker *worker)
+{
+	wake_up_process(worker->kthread_task);
+}
+
+static void vhost_task_do_stop(struct vhost_worker *worker)
+{
+	return vhost_task_stop(worker->vtsk);
+}
+
+static void vhost_kthread_do_stop(struct vhost_worker *worker)
+{
+	kthread_stop(worker->kthread_task);
+}
+
+static int vhost_task_worker_create(struct vhost_worker *worker,
+				    struct vhost_dev *dev, const char *name)
+{
+	struct vhost_task *vtsk;
+	u32 id;
+	int ret;
+
+	vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
+				 worker, name);
+	if (IS_ERR(vtsk))
+		return PTR_ERR(vtsk);
+
+	worker->vtsk = vtsk;
+	vhost_task_start(vtsk);
+	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+	if (ret < 0) {
+		vhost_task_do_stop(worker);
+		return ret;
+	}
+	worker->id = id;
+	return 0;
+}
+
+static int vhost_kthread_worker_create(struct vhost_worker *worker,
+				       struct vhost_dev *dev, const char *name)
+{
+	struct task_struct *task;
+	u32 id;
+	int ret;
+
+	task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	worker->kthread_task = task;
+	wake_up_process(task);
+	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+	if (ret < 0)
+		goto stop_worker;
+
+	ret = vhost_attach_task_to_cgroups(worker);
+	if (ret)
+		goto stop_worker;
+
+	worker->id = id;
+	return 0;
+
+stop_worker:
+	vhost_kthread_do_stop(worker);
+	return ret;
+}
+
+static const struct vhost_worker_ops kthread_ops = {
+	.create = vhost_kthread_worker_create,
+	.stop = vhost_kthread_do_stop,
+	.wakeup = vhost_kthread_wakeup,
+};
+
+static const struct vhost_worker_ops vhost_task_ops = {
+	.create = vhost_task_worker_create,
+	.stop = vhost_task_do_stop,
+	.wakeup = vhost_task_wakeup,
+};
+
 static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 {
 	struct vhost_worker *worker;
-	struct vhost_task *vtsk;
 	char name[TASK_COMM_LEN];
 	int ret;
-	u32 id;
+	const struct vhost_worker_ops *ops = dev->fork_owner ? &vhost_task_ops :
+							       &kthread_ops;
 
 	worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
 	if (!worker)
 		return NULL;
 
 	worker->dev = dev;
+	worker->ops = ops;
 	snprintf(name, sizeof(name), "vhost-%d", current->pid);
 
-	vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
-				 worker, name);
-	if (IS_ERR(vtsk))
-		goto free_worker;
-
 	mutex_init(&worker->mutex);
 	init_llist_head(&worker->work_list);
 	worker->kcov_handle = kcov_common_handle();
-	worker->vtsk = vtsk;
-
-	vhost_task_start(vtsk);
-
-	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+	ret = ops->create(worker, dev, name);
 	if (ret < 0)
-		goto stop_worker;
-	worker->id = id;
+		goto free_worker;
 
 	return worker;
 
-stop_worker:
-	vhost_task_stop(vtsk);
 free_worker:
 	kfree(worker);
 	return NULL;
@@ -865,6 +1025,14 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
 	switch (ioctl) {
 	/* dev worker ioctls */
 	case VHOST_NEW_WORKER:
+		/*
+		 * vhost_tasks will account for worker threads under the parent's
+		 * NPROC value but kthreads do not. To avoid userspace overflowing
+		 * the system with worker threads fork_owner must be true.
+		 */
+		if (!dev->fork_owner)
+			return -EFAULT;
+
 		ret = vhost_new_worker(dev, &state);
 		if (!ret && copy_to_user(argp, &state, sizeof(state)))
 			ret = -EFAULT;
@@ -982,6 +1150,7 @@ void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
 
 	vhost_dev_cleanup(dev);
 
+	dev->fork_owner = fork_from_owner_default;
 	dev->umem = umem;
 	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
 	 * VQs aren't running.
@@ -2135,6 +2304,45 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
 		goto done;
 	}
 
+#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
+	if (ioctl == VHOST_SET_FORK_FROM_OWNER) {
+		/* Only allow modification before owner is set */
+		if (vhost_dev_has_owner(d)) {
+			r = -EBUSY;
+			goto done;
+		}
+		u8 fork_owner_val;
+
+		if (get_user(fork_owner_val, (u8 __user *)argp)) {
+			r = -EFAULT;
+			goto done;
+		}
+		if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
+		    fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
+			r = -EINVAL;
+			goto done;
+		}
+		d->fork_owner = !!fork_owner_val;
+		r = 0;
+		goto done;
+	}
+	if (ioctl == VHOST_GET_FORK_FROM_OWNER) {
+		u8 fork_owner_val = d->fork_owner;
+
+		if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
+		    fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
+			r = -EINVAL;
+			goto done;
+		}
+		if (put_user(fork_owner_val, (u8 __user *)argp)) {
+			r = -EFAULT;
+			goto done;
+		}
+		r = 0;
+		goto done;
+	}
+#endif
+
 	/* You must be the owner to do anything else */
 	r = vhost_dev_check_owner(d);
 	if (r)
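
The remaining two files of this commit (the driver header and the uapi header) are not reproduced on this page. From the ops initializers and call sites above, the header side presumably looks roughly like the sketch below; the field names and order are inferred from how the ops are used, not copied from the actual header.

/*
 * Inferred sketch of the drivers/vhost/vhost.h side of this commit, based on
 * how worker->ops, worker->kthread_task and dev->fork_owner are used above.
 * Not copied from the real header.
 */
struct vhost_worker;
struct vhost_dev;

struct vhost_worker_ops {
	int (*create)(struct vhost_worker *worker, struct vhost_dev *dev,
		      const char *name);
	void (*stop)(struct vhost_worker *worker);
	void (*wakeup)(struct vhost_worker *worker);
};

/*
 * Likewise, struct vhost_worker presumably gains a 'struct task_struct
 * *kthread_task' and a 'const struct vhost_worker_ops *ops' member, and
 * struct vhost_dev a 'fork_owner' flag initialised from
 * fork_from_owner_default.
 */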
