Skip to content

Commit a06247c

Browse files
surenbaghdasaryanPeter Zijlstra
authored andcommitted
psi: Fix uaf issue when psi trigger is destroyed while being polled
With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak. Fixes: 0e94682 ("psi: introduce psi monitor") Reported-by: [email protected] Suggested-by: Linus Torvalds <[email protected]> Analyzed-by: Eric Biggers <[email protected]> Signed-off-by: Suren Baghdasaryan <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Reviewed-by: Eric Biggers <[email protected]> Acked-by: Johannes Weiner <[email protected]> Cc: [email protected] Link: https://lore.kernel.org/r/[email protected]
1 parent fb3b067 commit a06247c

File tree

5 files changed

+40
-45
lines changed

5 files changed

+40
-45
lines changed

Documentation/accounting/psi.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ Triggers can be set on more than one psi metric and more than one trigger
9292
for the same psi metric can be specified. However for each trigger a separate
9393
file descriptor is required to be able to poll it separately from others,
9494
therefore for each trigger a separate open() syscall should be made even
95-
when opening the same psi interface file.
95+
when opening the same psi interface file. Write operations to a file descriptor
96+
with an already existing psi trigger will fail with EBUSY.
9697

9798
Monitors activate only when system enters stall state for the monitored
9899
psi metric and deactivates upon exit from the stall state. While system is

include/linux/psi.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to);
3333

3434
struct psi_trigger *psi_trigger_create(struct psi_group *group,
3535
char *buf, size_t nbytes, enum psi_res res);
36-
void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t);
36+
void psi_trigger_destroy(struct psi_trigger *t);
3737

3838
__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
3939
poll_table *wait);

include/linux/psi_types.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,9 +141,6 @@ struct psi_trigger {
141141
* events to one per window
142142
*/
143143
u64 last_event_time;
144-
145-
/* Refcounting to prevent premature destruction */
146-
struct kref refcount;
147144
};
148145

149146
struct psi_group {

kernel/cgroup/cgroup.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3643,15 +3643,20 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
36433643
cgroup_get(cgrp);
36443644
cgroup_kn_unlock(of->kn);
36453645

3646+
/* Allow only one trigger per file descriptor */
3647+
if (ctx->psi.trigger) {
3648+
cgroup_put(cgrp);
3649+
return -EBUSY;
3650+
}
3651+
36463652
psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
36473653
new = psi_trigger_create(psi, buf, nbytes, res);
36483654
if (IS_ERR(new)) {
36493655
cgroup_put(cgrp);
36503656
return PTR_ERR(new);
36513657
}
36523658

3653-
psi_trigger_replace(&ctx->psi.trigger, new);
3654-
3659+
smp_store_release(&ctx->psi.trigger, new);
36553660
cgroup_put(cgrp);
36563661

36573662
return nbytes;
@@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
36903695
{
36913696
struct cgroup_file_ctx *ctx = of->priv;
36923697

3693-
psi_trigger_replace(&ctx->psi.trigger, NULL);
3698+
psi_trigger_destroy(ctx->psi.trigger);
36943699
}
36953700

36963701
bool cgroup_psi_enabled(void)

kernel/sched/psi.c

Lines changed: 29 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1162,7 +1162,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
11621162
t->event = 0;
11631163
t->last_event_time = 0;
11641164
init_waitqueue_head(&t->event_wait);
1165-
kref_init(&t->refcount);
11661165

11671166
mutex_lock(&group->trigger_lock);
11681167

@@ -1191,15 +1190,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
11911190
return t;
11921191
}
11931192

1194-
static void psi_trigger_destroy(struct kref *ref)
1193+
void psi_trigger_destroy(struct psi_trigger *t)
11951194
{
1196-
struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
1197-
struct psi_group *group = t->group;
1195+
struct psi_group *group;
11981196
struct task_struct *task_to_destroy = NULL;
11991197

1200-
if (static_branch_likely(&psi_disabled))
1198+
/*
1199+
* We do not check psi_disabled since it might have been disabled after
1200+
* the trigger got created.
1201+
*/
1202+
if (!t)
12011203
return;
12021204

1205+
group = t->group;
12031206
/*
12041207
* Wakeup waiters to stop polling. Can happen if cgroup is deleted
12051208
* from under a polling process.
@@ -1235,9 +1238,9 @@ static void psi_trigger_destroy(struct kref *ref)
12351238
mutex_unlock(&group->trigger_lock);
12361239

12371240
/*
1238-
* Wait for both *trigger_ptr from psi_trigger_replace and
1239-
* poll_task RCUs to complete their read-side critical sections
1240-
* before destroying the trigger and optionally the poll_task
1241+
* Wait for psi_schedule_poll_work RCU to complete its read-side
1242+
* critical section before destroying the trigger and optionally the
1243+
* poll_task.
12411244
*/
12421245
synchronize_rcu();
12431246
/*
@@ -1254,18 +1257,6 @@ static void psi_trigger_destroy(struct kref *ref)
12541257
kfree(t);
12551258
}
12561259

1257-
void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
1258-
{
1259-
struct psi_trigger *old = *trigger_ptr;
1260-
1261-
if (static_branch_likely(&psi_disabled))
1262-
return;
1263-
1264-
rcu_assign_pointer(*trigger_ptr, new);
1265-
if (old)
1266-
kref_put(&old->refcount, psi_trigger_destroy);
1267-
}
1268-
12691260
__poll_t psi_trigger_poll(void **trigger_ptr,
12701261
struct file *file, poll_table *wait)
12711262
{
@@ -1275,24 +1266,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
12751266
if (static_branch_likely(&psi_disabled))
12761267
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
12771268

1278-
rcu_read_lock();
1279-
1280-
t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
1281-
if (!t) {
1282-
rcu_read_unlock();
1269+
t = smp_load_acquire(trigger_ptr);
1270+
if (!t)
12831271
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1284-
}
1285-
kref_get(&t->refcount);
1286-
1287-
rcu_read_unlock();
12881272

12891273
poll_wait(file, &t->event_wait, wait);
12901274

12911275
if (cmpxchg(&t->event, 1, 0) == 1)
12921276
ret |= EPOLLPRI;
12931277

1294-
kref_put(&t->refcount, psi_trigger_destroy);
1295-
12961278
return ret;
12971279
}
12981280

@@ -1316,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
13161298

13171299
buf[buf_size - 1] = '\0';
13181300

1319-
new = psi_trigger_create(&psi_system, buf, nbytes, res);
1320-
if (IS_ERR(new))
1321-
return PTR_ERR(new);
1322-
13231301
seq = file->private_data;
1302+
13241303
/* Take seq->lock to protect seq->private from concurrent writes */
13251304
mutex_lock(&seq->lock);
1326-
psi_trigger_replace(&seq->private, new);
1305+
1306+
/* Allow only one trigger per file descriptor */
1307+
if (seq->private) {
1308+
mutex_unlock(&seq->lock);
1309+
return -EBUSY;
1310+
}
1311+
1312+
new = psi_trigger_create(&psi_system, buf, nbytes, res);
1313+
if (IS_ERR(new)) {
1314+
mutex_unlock(&seq->lock);
1315+
return PTR_ERR(new);
1316+
}
1317+
1318+
smp_store_release(&seq->private, new);
13271319
mutex_unlock(&seq->lock);
13281320

13291321
return nbytes;
@@ -1358,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file)
13581350
{
13591351
struct seq_file *seq = file->private_data;
13601352

1361-
psi_trigger_replace(&seq->private, NULL);
1353+
psi_trigger_destroy(seq->private);
13621354
return single_release(inode, file);
13631355
}
13641356

0 commit comments

Comments
 (0)