Skip to content

Commit 0a72761

Browse files
committed
Merge tag 'threads-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull thread updates from Christian Brauner: "This contains the changes to add the missing support for attaching to time namespaces via pidfds. Last cycle setns() was changed to support attaching to multiple namespaces atomically. This requires all namespaces to have a point of no return where they can't fail anymore. Specifically, <namespace-type>_install() is allowed to perform permission checks and install the namespace into the new struct nsset that it has been given but it is not allowed to make visible changes to the affected task. Once <namespace-type>_install() returns, anything that the given namespace type additionally requires to be set up ideally needs to be done in a function that can't fail or, if it fails, the failure must be non-fatal. For time namespaces the relevant functions that fell into this category were timens_set_vvar_page() and vdso_join_timens(). The latter could still fail although it didn't need to. This function is only implemented for x86 in current mainline. As discussed on-list (cf. [1]), in order to make setns() properly support time namespaces when attaching to multiple namespaces at once, we changed vdso_join_timens() to always succeed. So vdso_join_timens() replaces the mmap_write_lock_killable() with mmap_read_lock(). Please note that arm64 is about to grow vdso support for time namespaces (possibly this merge window). We've synced on this change and arm64 also uses mmap_read_lock(), i.e. makes vdso_join_timens() a function that can't fail. Once the changes here and the arm64 changes have landed, vdso_join_timens() should be turned into a void function so it's obvious to callers and implementers on other architectures that the expectation is that it can't fail. We didn't do this right away because it would've introduced unnecessary merge conflicts between the two trees for no major gain.
As always, tests included"

[1]: https://lore.kernel.org/lkml/20200611110221.pgd3r5qkjrjmfqa2@wittgenstein

* tag 'threads-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  tests: add CLONE_NEWTIME setns tests
  nsproxy: support CLONE_NEWTIME with setns()
  timens: add timens_commit() helper
  timens: make vdso_join_timens() always succeed
2 parents 3950e97 + 55d9ad9 commit 0a72761

File tree

6 files changed

+115
-19
lines changed

6 files changed

+115
-19
lines changed

arch/x86/entry/vdso/vma.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
144144
struct mm_struct *mm = task->mm;
145145
struct vm_area_struct *vma;
146146

147-
if (mmap_write_lock_killable(mm))
148-
return -EINTR;
147+
mmap_read_lock(mm);
149148

150149
for (vma = mm->mmap; vma; vma = vma->vm_next) {
151150
unsigned long size = vma->vm_end - vma->vm_start;
@@ -154,7 +153,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
154153
zap_page_range(vma, vma->vm_start, size);
155154
}
156155

157-
mmap_write_unlock(mm);
156+
mmap_read_unlock(mm);
158157
return 0;
159158
}
160159
#else

include/linux/time_namespace.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ extern struct time_namespace init_time_ns;
3333
#ifdef CONFIG_TIME_NS
3434
extern int vdso_join_timens(struct task_struct *task,
3535
struct time_namespace *ns);
36+
extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns);
3637

3738
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
3839
{
@@ -96,6 +97,11 @@ static inline int vdso_join_timens(struct task_struct *task,
9697
return 0;
9798
}
9899

100+
static inline void timens_commit(struct task_struct *tsk,
101+
struct time_namespace *ns)
102+
{
103+
}
104+
99105
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
100106
{
101107
return NULL;

kernel/nsproxy.c

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,8 @@ void exit_task_namespaces(struct task_struct *p)
262262
static int check_setns_flags(unsigned long flags)
263263
{
264264
if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
265-
CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
266-
CLONE_NEWCGROUP)))
265+
CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER |
266+
CLONE_NEWPID | CLONE_NEWCGROUP)))
267267
return -EINVAL;
268268

269269
#ifndef CONFIG_USER_NS
@@ -290,6 +290,10 @@ static int check_setns_flags(unsigned long flags)
290290
if (flags & CLONE_NEWNET)
291291
return -EINVAL;
292292
#endif
293+
#ifndef CONFIG_TIME_NS
294+
if (flags & CLONE_NEWTIME)
295+
return -EINVAL;
296+
#endif
293297

294298
return 0;
295299
}
@@ -464,6 +468,14 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid)
464468
}
465469
#endif
466470

471+
#ifdef CONFIG_TIME_NS
472+
if (flags & CLONE_NEWTIME) {
473+
ret = validate_ns(nsset, &nsp->time_ns->ns);
474+
if (ret)
475+
goto out;
476+
}
477+
#endif
478+
467479
out:
468480
if (pid_ns)
469481
put_pid_ns(pid_ns);
@@ -507,6 +519,11 @@ static void commit_nsset(struct nsset *nsset)
507519
exit_sem(me);
508520
#endif
509521

522+
#ifdef CONFIG_TIME_NS
523+
if (flags & CLONE_NEWTIME)
524+
timens_commit(me, nsset->nsproxy->time_ns);
525+
#endif
526+
510527
/* transfer ownership */
511528
switch_task_namespaces(me, nsset->nsproxy);
512529
nsset->nsproxy = NULL;

kernel/time/namespace.c

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -280,11 +280,16 @@ static void timens_put(struct ns_common *ns)
280280
put_time_ns(to_time_ns(ns));
281281
}
282282

283+
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
284+
{
285+
timens_set_vvar_page(tsk, ns);
286+
vdso_join_timens(tsk, ns);
287+
}
288+
283289
static int timens_install(struct nsset *nsset, struct ns_common *new)
284290
{
285291
struct nsproxy *nsproxy = nsset->nsproxy;
286292
struct time_namespace *ns = to_time_ns(new);
287-
int err;
288293

289294
if (!current_is_single_threaded())
290295
return -EUSERS;
@@ -293,12 +298,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
293298
!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
294299
return -EPERM;
295300

296-
timens_set_vvar_page(current, ns);
297-
298-
err = vdso_join_timens(current, ns);
299-
if (err)
300-
return err;
301-
302301
get_time_ns(ns);
303302
put_time_ns(nsproxy->time_ns);
304303
nsproxy->time_ns = ns;
@@ -313,22 +312,17 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
313312
{
314313
struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
315314
struct time_namespace *ns = to_time_ns(nsc);
316-
int err;
317315

318316
/* create_new_namespaces() already incremented the ref counter */
319317
if (nsproxy->time_ns == nsproxy->time_ns_for_children)
320318
return 0;
321319

322-
timens_set_vvar_page(tsk, ns);
323-
324-
err = vdso_join_timens(tsk, ns);
325-
if (err)
326-
return err;
327-
328320
get_time_ns(ns);
329321
put_time_ns(nsproxy->time_ns);
330322
nsproxy->time_ns = ns;
331323

324+
timens_commit(tsk, ns);
325+
332326
return 0;
333327
}
334328

tools/testing/selftests/pidfd/pidfd.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@
2222
#define P_PIDFD 3
2323
#endif
2424

25+
#ifndef CLONE_NEWTIME
26+
#define CLONE_NEWTIME 0x00000080
27+
#endif
28+
2529
#ifndef CLONE_PIDFD
2630
#define CLONE_PIDFD 0x00001000
2731
#endif

tools/testing/selftests/pidfd/pidfd_setns_test.c

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ enum {
3232
PIDFD_NS_NET,
3333
PIDFD_NS_CGROUP,
3434
PIDFD_NS_PIDCLD,
35+
PIDFD_NS_TIME,
3536
PIDFD_NS_MAX
3637
};
3738

@@ -47,6 +48,7 @@ const struct ns_info {
4748
[PIDFD_NS_NET] = { "net", CLONE_NEWNET, },
4849
[PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, },
4950
[PIDFD_NS_PIDCLD] = { "pid_for_children", 0, },
51+
[PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, },
5052
};
5153

5254
FIXTURE(current_nsset)
@@ -83,9 +85,49 @@ pid_t create_child(int *pidfd, unsigned flags)
8385
return sys_clone3(&args, sizeof(struct clone_args));
8486
}
8587

88+
static bool switch_timens(void)
89+
{
90+
int fd, ret;
91+
92+
if (unshare(CLONE_NEWTIME))
93+
return false;
94+
95+
fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC);
96+
if (fd < 0)
97+
return false;
98+
99+
ret = setns(fd, CLONE_NEWTIME);
100+
close(fd);
101+
return ret == 0;
102+
}
103+
104+
static ssize_t read_nointr(int fd, void *buf, size_t count)
105+
{
106+
ssize_t ret;
107+
108+
do {
109+
ret = read(fd, buf, count);
110+
} while (ret < 0 && errno == EINTR);
111+
112+
return ret;
113+
}
114+
115+
static ssize_t write_nointr(int fd, const void *buf, size_t count)
116+
{
117+
ssize_t ret;
118+
119+
do {
120+
ret = write(fd, buf, count);
121+
} while (ret < 0 && errno == EINTR);
122+
123+
return ret;
124+
}
125+
86126
FIXTURE_SETUP(current_nsset)
87127
{
88128
int i, proc_fd, ret;
129+
int ipc_sockets[2];
130+
char c;
89131

90132
for (i = 0; i < PIDFD_NS_MAX; i++) {
91133
self->nsfds[i] = -EBADF;
@@ -130,6 +172,9 @@ FIXTURE_SETUP(current_nsset)
130172
TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
131173
}
132174

175+
ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
176+
EXPECT_EQ(ret, 0);
177+
133178
/* Create tasks that will be stopped. */
134179
self->child_pid1 = create_child(&self->child_pidfd1,
135180
CLONE_NEWUSER | CLONE_NEWNS |
@@ -139,10 +184,27 @@ FIXTURE_SETUP(current_nsset)
139184
EXPECT_GE(self->child_pid1, 0);
140185

141186
if (self->child_pid1 == 0) {
187+
close(ipc_sockets[0]);
188+
189+
if (!switch_timens())
190+
_exit(EXIT_FAILURE);
191+
192+
if (write_nointr(ipc_sockets[1], "1", 1) < 0)
193+
_exit(EXIT_FAILURE);
194+
195+
close(ipc_sockets[1]);
196+
142197
pause();
143198
_exit(EXIT_SUCCESS);
144199
}
145200

201+
close(ipc_sockets[1]);
202+
ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
203+
close(ipc_sockets[0]);
204+
205+
ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
206+
EXPECT_EQ(ret, 0);
207+
146208
self->child_pid2 = create_child(&self->child_pidfd2,
147209
CLONE_NEWUSER | CLONE_NEWNS |
148210
CLONE_NEWCGROUP | CLONE_NEWIPC |
@@ -151,10 +213,24 @@ FIXTURE_SETUP(current_nsset)
151213
EXPECT_GE(self->child_pid2, 0);
152214

153215
if (self->child_pid2 == 0) {
216+
close(ipc_sockets[0]);
217+
218+
if (!switch_timens())
219+
_exit(EXIT_FAILURE);
220+
221+
if (write_nointr(ipc_sockets[1], "1", 1) < 0)
222+
_exit(EXIT_FAILURE);
223+
224+
close(ipc_sockets[1]);
225+
154226
pause();
155227
_exit(EXIT_SUCCESS);
156228
}
157229

230+
close(ipc_sockets[1]);
231+
ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
232+
close(ipc_sockets[0]);
233+
158234
for (i = 0; i < PIDFD_NS_MAX; i++) {
159235
char p[100];
160236

0 commit comments

Comments
 (0)