Commit 4c30f5c

sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()
Once a task is put into a DSQ, the allowed operations are fairly limited. Tasks in the built-in local and global DSQs are executed automatically and, ignoring dequeue, there is only one way a task in a user DSQ can be manipulated - scx_bpf_consume() moves the first task to the dispatching local DSQ. This inflexibility sometimes gets in the way and is an area where multiple feature requests have been made.

Implement scx_bpf_dispatch[_vtime]_from_dsq(), which can be called during DSQ iteration and can move the task to any DSQ - local DSQs, global DSQ and user DSQs. The kfuncs can be called from ops.dispatch() and any BPF context which doesn't hold a rq lock, including BPF timers and SYSCALL programs.

This is an expansion of an earlier patch which only allowed moving into the dispatching local DSQ:

  http://lkml.kernel.org/r/[email protected]

v2: Remove @slice and @vtime from scx_bpf_dispatch_from_dsq[_vtime]() as they push scx_bpf_dispatch_from_dsq_vtime() over the kfunc argument count limit and often won't be needed anyway. Instead provide scx_bpf_dispatch_from_dsq_set_{slice|vtime}() kfuncs which can be called only when needed and override the specified parameter for the subsequent dispatch.

Signed-off-by: Tejun Heo <[email protected]>
Cc: Daniel Hodges <[email protected]>
Cc: David Vernet <[email protected]>
Cc: Changwoo Min <[email protected]>
Cc: Andrea Righi <[email protected]>
Cc: Dan Schatzberg <[email protected]>
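For illustration only (not part of the commit), a minimal ops.dispatch() sketch using the new kfuncs could look like the following. MY_DSQ_ID and the 5ms slice are assumptions; bpf_for_each() comes from bpf_helpers.h and BPF_FOR_EACH_ITER from the common.bpf.h change below.

    /* sketch: scan an assumed scheduler-defined user DSQ and pull the
     * first movable task into the local DSQ with an overridden slice */
    void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
    {
            struct task_struct *p;

            bpf_for_each(scx_dsq, p, MY_DSQ_ID, 0) {
                    /* applies only to the next dispatch from this iterator */
                    scx_bpf_dispatch_from_dsq_set_slice(BPF_FOR_EACH_ITER,
                                                        5 * 1000 * 1000);
                    if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
                                                  SCX_DSQ_LOCAL, 0))
                            break;
            }
    }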
1 parent 6462dd5 commit 4c30f5c

2 files changed: +239 -3
kernel/sched/ext.c

Lines changed: 229 additions & 3 deletions
@@ -1158,6 +1158,11 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
         return true;
 }
 
+static bool scx_kf_allowed_if_unlocked(void)
+{
+        return !current->scx.kf_mask;
+}
+
 /**
  * nldsq_next_task - Iterate to the next task in a non-local DSQ
  * @dsq: user dsq being iterated
@@ -1211,13 +1216,20 @@ enum scx_dsq_iter_flags {
         /* iterate in the reverse dispatch order */
         SCX_DSQ_ITER_REV                = 1U << 16,
 
+        __SCX_DSQ_ITER_HAS_SLICE        = 1U << 30,
+        __SCX_DSQ_ITER_HAS_VTIME        = 1U << 31,
+
         __SCX_DSQ_ITER_USER_FLAGS       = SCX_DSQ_ITER_REV,
-        __SCX_DSQ_ITER_ALL_FLAGS        = __SCX_DSQ_ITER_USER_FLAGS,
+        __SCX_DSQ_ITER_ALL_FLAGS        = __SCX_DSQ_ITER_USER_FLAGS |
+                                          __SCX_DSQ_ITER_HAS_SLICE |
+                                          __SCX_DSQ_ITER_HAS_VTIME,
 };
 
 struct bpf_iter_scx_dsq_kern {
         struct scx_dsq_list_node        cursor;
         struct scx_dispatch_q           *dsq;
+        u64                             slice;
+        u64                             vtime;
 } __attribute__((aligned(8)));
 
 struct bpf_iter_scx_dsq {
@@ -5872,7 +5884,7 @@ __bpf_kfunc_start_defs();
  * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ
  * @p: task_struct to dispatch
  * @dsq_id: DSQ to dispatch to
- * @slice: duration @p can run for in nsecs
+ * @slice: duration @p can run for in nsecs, 0 to keep the current value
  * @enq_flags: SCX_ENQ_*
  *
  * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe
@@ -5922,7 +5934,7 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
  * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ
  * @p: task_struct to dispatch
  * @dsq_id: DSQ to dispatch to
- * @slice: duration @p can run for in nsecs
+ * @slice: duration @p can run for in nsecs, 0 to keep the current value
  * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
  * @enq_flags: SCX_ENQ_*
  *
@@ -5963,6 +5975,118 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
         .set                    = &scx_kfunc_ids_enqueue_dispatch,
 };
 
+static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
+                                  struct task_struct *p, u64 dsq_id,
+                                  u64 enq_flags)
+{
+        struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
+        struct rq *this_rq, *src_rq, *dst_rq, *locked_rq;
+        bool dispatched = false;
+        bool in_balance;
+        unsigned long flags;
+
+        if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
+                return false;
+
+        /*
+         * Can be called from either ops.dispatch() locking this_rq() or any
+         * context where no rq lock is held. If the latter, lock @p's task_rq
+         * which we'll likely need anyway.
+         */
+        src_rq = task_rq(p);
+
+        local_irq_save(flags);
+        this_rq = this_rq();
+        in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE;
+
+        if (in_balance) {
+                if (this_rq != src_rq) {
+                        raw_spin_rq_unlock(this_rq);
+                        raw_spin_rq_lock(src_rq);
+                }
+        } else {
+                raw_spin_rq_lock(src_rq);
+        }
+
+        locked_rq = src_rq;
+        raw_spin_lock(&src_dsq->lock);
+
+        /*
+         * Did someone else get to it? @p could have already left $src_dsq, got
+         * re-enqueued, or be in the process of being consumed by someone else.
+         */
+        if (unlikely(p->scx.dsq != src_dsq ||
+                     u32_before(kit->cursor.priv, p->scx.dsq_seq) ||
+                     p->scx.holding_cpu >= 0) ||
+            WARN_ON_ONCE(src_rq != task_rq(p))) {
+                raw_spin_unlock(&src_dsq->lock);
+                goto out;
+        }
+
+        /* @p is still on $src_dsq and stable, determine the destination */
+        dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
+
+        if (dst_dsq->id == SCX_DSQ_LOCAL) {
+                dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+                if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+                        dst_dsq = &scx_dsq_global;
+                        dst_rq = src_rq;
+                }
+        } else {
+                /* no need to migrate if destination is a non-local DSQ */
+                dst_rq = src_rq;
+        }
+
+        /*
+         * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
+         * CPU, @p will be migrated.
+         */
+        if (dst_dsq->id == SCX_DSQ_LOCAL) {
+                /* @p is going from a non-local DSQ to a local DSQ */
+                if (src_rq == dst_rq) {
+                        task_unlink_from_dsq(p, src_dsq);
+                        move_local_task_to_local_dsq(p, enq_flags,
+                                                     src_dsq, dst_rq);
+                        raw_spin_unlock(&src_dsq->lock);
+                } else {
+                        raw_spin_unlock(&src_dsq->lock);
+                        move_remote_task_to_local_dsq(p, enq_flags,
+                                                      src_rq, dst_rq);
+                        locked_rq = dst_rq;
+                }
+        } else {
+                /*
+                 * @p is going from a non-local DSQ to a non-local DSQ. As
+                 * $src_dsq is already locked, do an abbreviated dequeue.
+                 */
+                task_unlink_from_dsq(p, src_dsq);
+                p->scx.dsq = NULL;
+                raw_spin_unlock(&src_dsq->lock);
+
+                if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
+                        p->scx.dsq_vtime = kit->vtime;
+                dispatch_enqueue(dst_dsq, p, enq_flags);
+        }
+
+        if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
+                p->scx.slice = kit->slice;
+
+        dispatched = true;
+out:
+        if (in_balance) {
+                if (this_rq != locked_rq) {
+                        raw_spin_rq_unlock(locked_rq);
+                        raw_spin_rq_lock(this_rq);
+                }
+        } else {
+                raw_spin_rq_unlock_irqrestore(locked_rq, flags);
+        }
+
+        kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE |
+                               __SCX_DSQ_ITER_HAS_VTIME);
+        return dispatched;
+}
+
 __bpf_kfunc_start_defs();
 
 /**
@@ -6042,12 +6166,112 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
         }
 }
 
+/**
+ * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ
+ * @it__iter: DSQ iterator in progress
+ * @slice: duration the dispatched task can run for in nsecs
+ *
+ * Override the slice of the next task that will be dispatched from @it__iter
+ * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called,
+ * the previous slice duration is kept.
+ */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
+                        struct bpf_iter_scx_dsq *it__iter, u64 slice)
+{
+        struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
+
+        kit->slice = slice;
+        kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
+}
+
+/**
+ * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ
+ * @it__iter: DSQ iterator in progress
+ * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
+ *
+ * Override the vtime of the next task that will be dispatched from @it__iter
+ * using scx_bpf_dispatch_from_dsq_vtime(). If this function is not called,
+ * the previous vtime is kept. If scx_bpf_dispatch_from_dsq() is used to
+ * dispatch the next task, the override is ignored and cleared.
+ */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
+                        struct bpf_iter_scx_dsq *it__iter, u64 vtime)
+{
+        struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
+
+        kit->vtime = vtime;
+        kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
+}
+
+/**
+ * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ
+ * @it__iter: DSQ iterator in progress
+ * @p: task to transfer
+ * @dsq_id: DSQ to move @p to
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ
+ * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
+ * be the destination.
+ *
+ * For the transfer to be successful, @p must still be on the DSQ and have been
+ * queued before the DSQ iteration started. This function doesn't care whether
+ * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have
+ * been queued before the iteration started.
+ *
+ * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to
+ * update.
+ *
+ * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
+ * lock (e.g. BPF timers or SYSCALL programs).
+ *
+ * Returns %true if @p has been consumed, %false if @p had already been consumed
+ * or dequeued.
+ */
+__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
+                                           struct task_struct *p, u64 dsq_id,
+                                           u64 enq_flags)
+{
+        return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
+                                     p, dsq_id, enq_flags);
+}
+
+/**
+ * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ
+ * @it__iter: DSQ iterator in progress
+ * @p: task to transfer
+ * @dsq_id: DSQ to move @p to
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Transfer @p which is on the DSQ currently iterated by @it__iter to the
+ * priority queue of the DSQ specified by @dsq_id. The destination must be a
+ * user DSQ as only user DSQs support priority queues.
+ *
+ * @p's slice and vtime are kept by default. Use
+ * scx_bpf_dispatch_from_dsq_set_slice() and
+ * scx_bpf_dispatch_from_dsq_set_vtime() to update.
+ *
+ * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See
+ * scx_bpf_dispatch_vtime() for more information on @vtime.
+ */
+__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
+                                                 struct task_struct *p, u64 dsq_id,
+                                                 u64 enq_flags)
+{
+        return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter,
+                                     p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
 BTF_ID_FLAGS(func, scx_bpf_consume)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
@@ -6144,6 +6368,8 @@ __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
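Because the new kfuncs are registered in both the dispatch and unlocked sets above, they are usable from ops.dispatch() as well as from rq-unlocked contexts such as BPF timers and SYSCALL programs. A hedged sketch of the latter, assuming scheduler-defined user DSQs DSQ_A and DSQ_B (not part of this commit):

    SEC("syscall")
    int migrate_dsq(void *ctx)
    {
            struct task_struct *p;

            /* drain DSQ_A into the vtime-ordered DSQ_B; each task's
             * existing slice and vtime are preserved by default */
            bpf_for_each(scx_dsq, p, DSQ_A, 0)
                    scx_bpf_dispatch_vtime_from_dsq(BPF_FOR_EACH_ITER, p,
                                                    DSQ_B, 0);
            return 0;
    }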

tools/sched_ext/include/scx/common.bpf.h

Lines changed: 10 additions & 0 deletions
@@ -35,6 +35,10 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt
 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
 void scx_bpf_dispatch_cancel(void) __ksym;
 bool scx_bpf_consume(u64 dsq_id) __ksym;
+void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym;
+void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym;
+bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
 u32 scx_bpf_reenqueue_local(void) __ksym;
 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
@@ -63,6 +67,12 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
 struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
 
+/*
+ * Use the following as @it__iter when calling
+ * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops.
+ */
+#define BPF_FOR_EACH_ITER       (&___it)
+
 static inline __attribute__((format(printf, 1, 2)))
 void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
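BPF_FOR_EACH_ITER works because bpf_for_each() in bpf_helpers.h names its open-coded iterator variable ___it; the macro simply takes its address so the in-flight iterator can be handed to the new kfuncs. A speculative sketch that targets another CPU's local DSQ (SHARED_DSQ and the idle-CPU policy are assumptions, not part of this commit):

    bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
            s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);

            /* dispatch into the idle CPU's local DSQ, then wake it */
            if (cpu >= 0 &&
                scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
                                          SCX_DSQ_LOCAL_ON | cpu, 0)) {
                    scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
                    break;
            }
    }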
6878
