Skip to content

Commit e92e255

Browse files
benzea authored and jmberg-intel committed
um: pass FD for memory operations when needed
Instead of always sharing the FDs with the userspace process, only hand over the FDs needed for mmap when required. The idea is that userspace might be able to force the stub into executing an mmap syscall; however, it will not be able to manipulate the control flow sufficiently to have access to an FD that would allow mapping arbitrary memory.

Security-wise, we need to be sure that only the expected syscalls are executed after the kernel sends FDs through the socket. This is currently not the case, as userspace can trivially jump to the rt_sigreturn syscall instruction to execute any syscall that the stub is permitted to do. With this, it can trick the kernel into sending the FD, which in turn allows userspace to freely map any physical memory.

As such, this is currently *not* secure. However, in principle the approach should be fine with a stricter SECCOMP filter and a careful review of the stub control flow (as userspace can prepare a stack). With some care, it is likely possible to extend the security model to SMP if desired.

Signed-off-by: Benjamin Berg <[email protected]>
Link: https://patch.msgid.link/[email protected]
Signed-off-by: Johannes Berg <[email protected]>
1 parent beddc9f commit e92e255

File tree

8 files changed

+280
-60
lines changed

8 files changed

+280
-60
lines changed

arch/um/include/shared/skas/mm_id.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,10 +6,17 @@
66
#ifndef __MM_ID_H
77
#define __MM_ID_H
88

9+
#define STUB_MAX_FDS 4
10+
911
struct mm_id {
1012
int pid;
1113
unsigned long stack;
1214
int syscall_data_len;
15+
16+
/* Only used with SECCOMP mode */
17+
int sock;
18+
int syscall_fd_num;
19+
int syscall_fd_map[STUB_MAX_FDS];
1320
};
1421

1522
void __switch_mm(struct mm_id *mm_idp);

arch/um/include/shared/skas/stub-data.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
#include <as-layout.h>
1313
#include <sysdep/tls.h>
1414
#include <sysdep/stub-data.h>
15+
#include <mm_id.h>
1516

1617
#define FUTEX_IN_CHILD 0
1718
#define FUTEX_IN_KERN 1

arch/um/kernel/skas/mmu.c

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,9 @@ void destroy_context(struct mm_struct *mm)
7878
mmu->id.pid = -1;
7979
}
8080

81+
if (using_seccomp && mmu->id.sock)
82+
os_close_file(mmu->id.sock);
83+
8184
free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES));
8285

8386
guard(spinlock_irqsave)(&mm_list_lock);

arch/um/kernel/skas/stub.c

Lines changed: 80 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -6,23 +6,53 @@
66
#include <sysdep/stub.h>
77

88
#include <linux/futex.h>
9+
#include <sys/socket.h>
910
#include <errno.h>
1011

11-
static __always_inline int syscall_handler(struct stub_data *d)
12+
/*
13+
* Known security issues
14+
*
15+
* Userspace can jump to this address to execute *any* syscall that is
16+
* permitted by the stub. As we will return afterwards, it can do
17+
* whatever it likes, including:
18+
* - Tricking the kernel into handing out the memory FD
19+
* - Using this memory FD to read/write all physical memory
20+
* - Running in parallel to the kernel processing a syscall
21+
* (possibly creating data races?)
22+
* - Blocking e.g. SIGALRM to avoid time based scheduling
23+
*
24+
* To avoid this, the permitted location for each syscall needs to be
25+
* checked for in the SECCOMP filter (which is reasonably simple). Also,
26+
* more care will need to go into considering how the code might be
27+
* tricked by using a prepared stack (or even modifying the stack from
28+
* another thread in case SMP support is added).
29+
*
30+
* As for the SIGALRM, the best countermeasure will be to check in the
31+
* kernel that the process is reporting back the SIGALRM in a timely
32+
* fashion.
33+
*/
34+
static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS])
1235
{
36+
struct stub_data *d = get_stub_data();
1337
int i;
1438
unsigned long res;
39+
int fd;
1540

1641
for (i = 0; i < d->syscall_data_len; i++) {
1742
struct stub_syscall *sc = &d->syscall_data[i];
1843

1944
switch (sc->syscall) {
2045
case STUB_SYSCALL_MMAP:
46+
if (fd_map)
47+
fd = fd_map[sc->mem.fd];
48+
else
49+
fd = sc->mem.fd;
50+
2151
res = stub_syscall6(STUB_MMAP_NR,
2252
sc->mem.addr, sc->mem.length,
2353
sc->mem.prot,
2454
MAP_SHARED | MAP_FIXED,
25-
sc->mem.fd, sc->mem.offset);
55+
fd, sc->mem.offset);
2656
if (res != sc->mem.addr) {
2757
d->err = res;
2858
d->syscall_data_len = i;
@@ -54,9 +84,7 @@ static __always_inline int syscall_handler(struct stub_data *d)
5484
void __section(".__syscall_stub")
5585
stub_syscall_handler(void)
5686
{
57-
struct stub_data *d = get_stub_data();
58-
59-
syscall_handler(d);
87+
syscall_handler(NULL);
6088

6189
trap_myself();
6290
}
@@ -65,7 +93,25 @@ void __section(".__syscall_stub")
6593
stub_signal_interrupt(int sig, siginfo_t *info, void *p)
6694
{
6795
struct stub_data *d = get_stub_data();
96+
char rcv_data;
97+
union {
98+
char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)];
99+
struct cmsghdr align;
100+
} ctrl = {};
101+
struct iovec iov = {
102+
.iov_base = &rcv_data,
103+
.iov_len = 1,
104+
};
105+
struct msghdr msghdr = {
106+
.msg_iov = &iov,
107+
.msg_iovlen = 1,
108+
.msg_control = &ctrl,
109+
.msg_controllen = sizeof(ctrl),
110+
};
68111
ucontext_t *uc = p;
112+
struct cmsghdr *fd_msg;
113+
int *fd_map;
114+
int num_fds;
69115
long res;
70116

71117
d->signal = sig;
@@ -78,6 +124,7 @@ stub_signal_interrupt(int sig, siginfo_t *info, void *p)
78124
res = stub_syscall3(__NR_futex, (unsigned long)&d->futex,
79125
FUTEX_WAKE, 1);
80126
} while (res == -EINTR);
127+
81128
do {
82129
res = stub_syscall4(__NR_futex, (unsigned long)&d->futex,
83130
FUTEX_WAIT, FUTEX_IN_KERN, 0);
@@ -86,11 +133,37 @@ stub_signal_interrupt(int sig, siginfo_t *info, void *p)
86133
if (res < 0 && res != -EAGAIN)
87134
stub_syscall1(__NR_exit_group, 1);
88135

89-
/* Try running queued syscalls. */
90-
if (syscall_handler(d) < 0 || d->restart_wait) {
136+
if (d->syscall_data_len) {
137+
/* Read passed FDs (if any) */
138+
do {
139+
res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0);
140+
} while (res == -EINTR);
141+
142+
/* We should never have a receive error (other than -EAGAIN) */
143+
if (res < 0 && res != -EAGAIN)
144+
stub_syscall1(__NR_exit_group, 1);
145+
146+
/* Receive the FDs */
147+
num_fds = 0;
148+
fd_msg = msghdr.msg_control;
149+
fd_map = (void *)&CMSG_DATA(fd_msg);
150+
if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr))
151+
num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
152+
153+
/* Try running queued syscalls. */
154+
res = syscall_handler(fd_map);
155+
156+
while (num_fds)
157+
stub_syscall2(__NR_close, fd_map[--num_fds], 0);
158+
} else {
159+
res = 0;
160+
}
161+
162+
if (res < 0 || d->restart_wait) {
91163
/* Report SIGSYS if we restart. */
92164
d->signal = SIGSYS;
93165
d->restart_wait = 0;
166+
94167
goto restart_wait;
95168
}
96169

arch/um/kernel/skas/stub_exe.c

Lines changed: 28 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
#include <sys/ptrace.h>
22
#include <sys/prctl.h>
3+
#include <sys/fcntl.h>
34
#include <asm/unistd.h>
45
#include <sysdep/stub.h>
56
#include <stub-data.h>
@@ -45,7 +46,11 @@ noinline static void real_init(void)
4546
if (res != sizeof(init_data))
4647
stub_syscall1(__NR_exit, 10);
4748

48-
stub_syscall1(__NR_close, 0);
49+
/* In SECCOMP mode, FD 0 is a socket and is later used for FD passing */
50+
if (!init_data.seccomp)
51+
stub_syscall1(__NR_close, 0);
52+
else
53+
stub_syscall3(__NR_fcntl, 0, F_SETFL, O_NONBLOCK);
4954

5055
/* map stub code + data */
5156
res = stub_syscall6(STUB_MMAP_NR,
@@ -63,6 +68,13 @@ noinline static void real_init(void)
6368
if (res != init_data.stub_start + UM_KERN_PAGE_SIZE)
6469
stub_syscall1(__NR_exit, 12);
6570

71+
/* In SECCOMP mode, we only need the signalling FD from now on */
72+
if (init_data.seccomp) {
73+
res = stub_syscall3(__NR_close_range, 1, ~0U, 0);
74+
if (res != 0)
75+
stub_syscall1(__NR_exit, 13);
76+
}
77+
6678
/* setup signal stack inside stub data */
6779
stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
6880
stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
@@ -77,40 +89,40 @@ noinline static void real_init(void)
7789
res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
7890
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
7991
if (res != 0)
80-
stub_syscall1(__NR_exit, 13);
92+
stub_syscall1(__NR_exit, 14);
8193
} else {
8294
/* SECCOMP mode uses rt_sigreturn, need to mask all signals */
8395
sa.sa_mask = ~0ULL;
8496

8597
res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
8698
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
8799
if (res != 0)
88-
stub_syscall1(__NR_exit, 14);
100+
stub_syscall1(__NR_exit, 15);
89101

90102
res = stub_syscall4(__NR_rt_sigaction, SIGSYS,
91103
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
92104
if (res != 0)
93-
stub_syscall1(__NR_exit, 15);
105+
stub_syscall1(__NR_exit, 16);
94106

95107
res = stub_syscall4(__NR_rt_sigaction, SIGALRM,
96108
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
97109
if (res != 0)
98-
stub_syscall1(__NR_exit, 16);
110+
stub_syscall1(__NR_exit, 17);
99111

100112
res = stub_syscall4(__NR_rt_sigaction, SIGTRAP,
101113
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
102114
if (res != 0)
103-
stub_syscall1(__NR_exit, 17);
115+
stub_syscall1(__NR_exit, 18);
104116

105117
res = stub_syscall4(__NR_rt_sigaction, SIGILL,
106118
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
107119
if (res != 0)
108-
stub_syscall1(__NR_exit, 18);
120+
stub_syscall1(__NR_exit, 19);
109121

110122
res = stub_syscall4(__NR_rt_sigaction, SIGFPE,
111123
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
112124
if (res != 0)
113-
stub_syscall1(__NR_exit, 19);
125+
stub_syscall1(__NR_exit, 20);
114126
}
115127

116128
/*
@@ -153,8 +165,12 @@ noinline static void real_init(void)
153165
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
154166
offsetof(struct seccomp_data, nr)),
155167

156-
/* [10-14] Check against permitted syscalls */
168+
/* [10-16] Check against permitted syscalls */
157169
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
170+
7, 0),
171+
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_recvmsg,
172+
6, 0),
173+
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_close,
158174
5, 0),
159175
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
160176
4, 0),
@@ -170,10 +186,10 @@ noinline static void real_init(void)
170186
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
171187
1, 0),
172188

173-
/* [15] Not one of the permitted syscalls */
189+
/* [17] Not one of the permitted syscalls */
174190
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
175191

176-
/* [16] Permitted call for the stub */
192+
/* [18] Permitted call for the stub */
177193
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
178194
};
179195
struct sock_fprog prog = {
@@ -184,7 +200,7 @@ noinline static void real_init(void)
184200
if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
185201
SECCOMP_FILTER_FLAG_TSYNC,
186202
(unsigned long)&prog) != 0)
187-
stub_syscall1(__NR_exit, 20);
203+
stub_syscall1(__NR_exit, 21);
188204

189205
/* Fall through, the exit syscall will cause SIGSYS */
190206
} else {

arch/um/os-Linux/skas/mem.c

Lines changed: 63 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,16 @@ void syscall_stub_dump_error(struct mm_id *mm_idp)
4343

4444
print_hex_dump(UM_KERN_ERR, " syscall data: ", 0,
4545
16, 4, sc, sizeof(*sc), 0);
46+
47+
if (using_seccomp) {
48+
printk(UM_KERN_ERR "%s: FD map num: %d", __func__,
49+
mm_idp->syscall_fd_num);
50+
print_hex_dump(UM_KERN_ERR,
51+
" FD map: ", 0, 16,
52+
sizeof(mm_idp->syscall_fd_map[0]),
53+
mm_idp->syscall_fd_map,
54+
sizeof(mm_idp->syscall_fd_map), 0);
55+
}
4656
}
4757

4858
static inline unsigned long *check_init_stack(struct mm_id * mm_idp,
@@ -118,6 +128,9 @@ static inline long do_syscall_stub(struct mm_id *mm_idp)
118128
mm_idp->syscall_data_len = 0;
119129
}
120130

131+
if (using_seccomp)
132+
mm_idp->syscall_fd_num = 0;
133+
121134
return mm_idp->syscall_data_len;
122135
}
123136

@@ -180,19 +193,66 @@ static struct stub_syscall *syscall_stub_get_previous(struct mm_id *mm_idp,
180193
return NULL;
181194
}
182195

196+
static int get_stub_fd(struct mm_id *mm_idp, int fd)
197+
{
198+
int i;
199+
200+
/* Find an FD slot (or flush and use first) */
201+
if (!using_seccomp)
202+
return fd;
203+
204+
/* Already crashed, value does not matter */
205+
if (mm_idp->syscall_data_len < 0)
206+
return 0;
207+
208+
/* Find existing FD in map if we can allocate another syscall */
209+
if (mm_idp->syscall_data_len <
210+
ARRAY_SIZE(((struct stub_data *)NULL)->syscall_data)) {
211+
for (i = 0; i < mm_idp->syscall_fd_num; i++) {
212+
if (mm_idp->syscall_fd_map[i] == fd)
213+
return i;
214+
}
215+
216+
if (mm_idp->syscall_fd_num < STUB_MAX_FDS) {
217+
i = mm_idp->syscall_fd_num;
218+
mm_idp->syscall_fd_map[i] = fd;
219+
220+
mm_idp->syscall_fd_num++;
221+
222+
return i;
223+
}
224+
}
225+
226+
/* FD map full or no syscall space available, continue after flush */
227+
do_syscall_stub(mm_idp);
228+
mm_idp->syscall_fd_map[0] = fd;
229+
mm_idp->syscall_fd_num = 1;
230+
231+
return 0;
232+
}
233+
183234
int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot,
184235
int phys_fd, unsigned long long offset)
185236
{
186237
struct stub_syscall *sc;
187238

188239
/* Compress with previous syscall if that is possible */
189240
sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MMAP, virt);
190-
if (sc && sc->mem.prot == prot && sc->mem.fd == phys_fd &&
241+
if (sc && sc->mem.prot == prot &&
191242
sc->mem.offset == MMAP_OFFSET(offset - sc->mem.length)) {
192-
sc->mem.length += len;
193-
return 0;
243+
int prev_fd = sc->mem.fd;
244+
245+
if (using_seccomp)
246+
prev_fd = mm_idp->syscall_fd_map[sc->mem.fd];
247+
248+
if (phys_fd == prev_fd) {
249+
sc->mem.length += len;
250+
return 0;
251+
}
194252
}
195253

254+
phys_fd = get_stub_fd(mm_idp, phys_fd);
255+
196256
sc = syscall_stub_alloc(mm_idp);
197257
sc->syscall = STUB_SYSCALL_MMAP;
198258
sc->mem.addr = virt;

0 commit comments

Comments
 (0)