Skip to content

Commit 7d5265f

Browse files
compudjPeter Zijlstra
authored andcommitted
rseq: Validate read-only fields under DEBUG_RSEQ config
The rseq uapi requires cooperation between users of the rseq fields to ensure that all libraries and applications using rseq within a process do not interfere with each other. This is especially important for fields which are meant to be read-only from user-space, as documented in uapi/linux/rseq.h: - cpu_id_start, - cpu_id, - node_id, - mm_cid. Storing to those fields from a user-space library prevents any sharing of the rseq ABI with other libraries and applications, as other users are not aware that the content of those fields has been altered by a third-party library. This is unfortunately the current behavior of tcmalloc: it purposefully overlaps part of a cached value with the cpu_id_start upper bits to get notified about preemption, because the kernel clears those upper bits before returning to user-space. This behavior does not conform to the rseq uapi header ABI. This prevents tcmalloc from using rseq when rseq is registered by the GNU C library 2.35+. It requires tcmalloc users to disable glibc rseq registration with a glibc tunable, which is a sad state of affairs. Considering that tcmalloc and the GNU C library are the two first upstream projects using rseq, and that they are already incompatible due to use of this hack, adding kernel-level validation of all read-only fields content is necessary to ensure future users of rseq abide by the rseq ABI requirements. Validate that user-space does not corrupt the read-only fields and conform to the rseq uapi header ABI when the kernel is built with CONFIG_DEBUG_RSEQ=y. This is done by storing a copy of the read-only fields in the task_struct, and validating the prior values present in user-space before updating them. If the values do not match, print a warning on the console (printk_ratelimited()). This is a first step to identify misuses of the rseq ABI by printing a warning on the console. 
After giving some time to userspace to correct its use of rseq, the plan is to eventually terminate offending processes with SIGSEGV. This change is expected to produce warnings for the upstream tcmalloc implementation, but tcmalloc developers mentioned they were open to adapt their implementation to kernel-level change. Signed-off-by: Mathieu Desnoyers <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Link: google/tcmalloc#144
1 parent 2a77e4b commit 7d5265f

File tree

2 files changed

+107
-0
lines changed

2 files changed

+107
-0
lines changed

include/linux/sched.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1367,6 +1367,15 @@ struct task_struct {
13671367
* with respect to preemption.
13681368
*/
13691369
unsigned long rseq_event_mask;
1370+
# ifdef CONFIG_DEBUG_RSEQ
1371+
/*
1372+
* This is a place holder to save a copy of the rseq fields for
1373+
* validation of read-only fields. The struct rseq has a
1374+
* variable-length array at the end, so it cannot be used
1375+
* directly. Reserve a size large enough for the known fields.
1376+
*/
1377+
char rseq_fields[sizeof(struct rseq)];
1378+
# endif
13701379
#endif
13711380

13721381
#ifdef CONFIG_SCHED_MM_CID

kernel/rseq.c

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <linux/syscalls.h>
1414
#include <linux/rseq.h>
1515
#include <linux/types.h>
16+
#include <linux/ratelimit.h>
1617
#include <asm/ptrace.h>
1718

1819
#define CREATE_TRACE_POINTS
@@ -25,6 +26,78 @@
2526
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
2627
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
2728

29+
#ifdef CONFIG_DEBUG_RSEQ
30+
static struct rseq *rseq_kernel_fields(struct task_struct *t)
31+
{
32+
return (struct rseq *) t->rseq_fields;
33+
}
34+
35+
static int rseq_validate_ro_fields(struct task_struct *t)
36+
{
37+
static DEFINE_RATELIMIT_STATE(_rs,
38+
DEFAULT_RATELIMIT_INTERVAL,
39+
DEFAULT_RATELIMIT_BURST);
40+
u32 cpu_id_start, cpu_id, node_id, mm_cid;
41+
struct rseq __user *rseq = t->rseq;
42+
43+
/*
44+
* Validate fields which are required to be read-only by
45+
* user-space.
46+
*/
47+
if (!user_read_access_begin(rseq, t->rseq_len))
48+
goto efault;
49+
unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
50+
unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
51+
unsafe_get_user(node_id, &rseq->node_id, efault_end);
52+
unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
53+
user_read_access_end();
54+
55+
if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
56+
cpu_id != rseq_kernel_fields(t)->cpu_id ||
57+
node_id != rseq_kernel_fields(t)->node_id ||
58+
mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
59+
60+
pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
61+
"\tcpu_id_start: %u ?= %u\n"
62+
"\tcpu_id: %u ?= %u\n"
63+
"\tnode_id: %u ?= %u\n"
64+
"\tmm_cid: %u ?= %u\n",
65+
t->pid, t->comm,
66+
cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
67+
cpu_id, rseq_kernel_fields(t)->cpu_id,
68+
node_id, rseq_kernel_fields(t)->node_id,
69+
mm_cid, rseq_kernel_fields(t)->mm_cid);
70+
}
71+
72+
/* For now, only print a console warning on mismatch. */
73+
return 0;
74+
75+
efault_end:
76+
user_read_access_end();
77+
efault:
78+
return -EFAULT;
79+
}
80+
81+
static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
82+
u32 node_id, u32 mm_cid)
83+
{
84+
rseq_kernel_fields(t)->cpu_id_start = cpu_id_start;
85+
rseq_kernel_fields(t)->cpu_id = cpu_id;
86+
rseq_kernel_fields(t)->node_id = node_id;
87+
rseq_kernel_fields(t)->mm_cid = mm_cid;
88+
}
89+
#else
90+
static int rseq_validate_ro_fields(struct task_struct *t)
91+
{
92+
return 0;
93+
}
94+
95+
static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
96+
u32 node_id, u32 mm_cid)
97+
{
98+
}
99+
#endif
100+
28101
/*
29102
*
30103
* Restartable sequences are a lightweight interface that allows
@@ -92,6 +165,11 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
92165
u32 node_id = cpu_to_node(cpu_id);
93166
u32 mm_cid = task_mm_cid(t);
94167

168+
/*
169+
* Validate read-only rseq fields.
170+
*/
171+
if (rseq_validate_ro_fields(t))
172+
goto efault;
95173
WARN_ON_ONCE((int) mm_cid < 0);
96174
if (!user_write_access_begin(rseq, t->rseq_len))
97175
goto efault;
@@ -105,6 +183,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
105183
* t->rseq_len != ORIG_RSEQ_SIZE.
106184
*/
107185
user_write_access_end();
186+
rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid);
108187
trace_rseq_update(t);
109188
return 0;
110189

@@ -119,6 +198,11 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
119198
u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
120199
mm_cid = 0;
121200

201+
/*
202+
* Validate read-only rseq fields.
203+
*/
204+
if (rseq_validate_ro_fields(t))
205+
return -EFAULT;
122206
/*
123207
* Reset cpu_id_start to its initial state (0).
124208
*/
@@ -141,6 +225,9 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
141225
*/
142226
if (put_user(mm_cid, &t->rseq->mm_cid))
143227
return -EFAULT;
228+
229+
rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid);
230+
144231
/*
145232
* Additional feature fields added after ORIG_RSEQ_SIZE
146233
* need to be conditionally reset only if
@@ -423,6 +510,17 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
423510
current->rseq = rseq;
424511
current->rseq_len = rseq_len;
425512
current->rseq_sig = sig;
513+
#ifdef CONFIG_DEBUG_RSEQ
514+
/*
515+
* Initialize the in-kernel rseq fields copy for validation of
516+
* read-only fields.
517+
*/
518+
if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
519+
get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
520+
get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
521+
get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
522+
return -EFAULT;
523+
#endif
426524
/*
427525
* If rseq was previously inactive, and has just been
428526
* registered, ensure the cpu_id_start and cpu_id fields

0 commit comments

Comments
 (0)