Skip to content

Commit e9b3f74

Browse files
committed
fix: defer reopen of live-thread proc fds until after clone()
Instead of rewriting /proc/<pid>/task/<tid>/... paths to the leader thread (which returns wrong data), defer the actual open until after threads exist in the restorer blob. Phase 1 (collect): Mark live-thread proc paths with deferred_thread_fd instead of rewriting them. Dead-thread TASK_HELPER case is unchanged. Phase 2 (placeholder): Open /dev/null to reserve the fd number, then collect deferred fd metadata into rst_mem for the restorer. Phase 3 (restorer): After clone() creates all threads, loop through deferred fds and reopen via sys_openat(proc_fd, path) + sys_dup2() + sys_lseek(). At this point /proc/<pid>/task/<tid>/ exists. The test is expanded to verify both dead-thread (fd validity check) and live-thread (reads tid from /proc/self/task/<tid>/stat to prove it points to the correct thread, not the leader) cases.
1 parent ca27a87 commit e9b3f74

File tree

6 files changed

+293
-67
lines changed

6 files changed

+293
-67
lines changed

criu/cr-restore.c

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -272,16 +272,19 @@ static int crtools_prepare_shared(void)
272272
*/
273273

274274
static struct collect_image_info *cinfos[] = {
275-
&file_locks_cinfo, &pipe_data_cinfo, &fifo_data_cinfo, &sk_queues_cinfo,
275+
&file_locks_cinfo,
276+
&pipe_data_cinfo,
277+
&fifo_data_cinfo,
278+
&sk_queues_cinfo,
276279
#ifdef CONFIG_HAS_LIBBPF
277280
&bpfmap_data_cinfo,
278281
#endif
279282
};
280283

281284
static struct collect_image_info *cinfos_files[] = {
282-
&unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo,
283-
&netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo,
284-
&tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo,
285+
&unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo,
286+
&netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo,
287+
&tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo,
285288
&fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo
286289
};
287290

@@ -500,6 +503,65 @@ static int collect_inotify_fds(struct task_restore_args *ta)
500503
return 0;
501504
}
502505

506+
static int collect_deferred_proc_fds(struct task_restore_args *ta)
507+
{
508+
struct list_head *list = &rsti(current)->fds;
509+
struct fdt *fdt = rsti(current)->fdt;
510+
struct fdinfo_list_entry *fle;
511+
512+
/* Only the fdt owner restores fds */
513+
if (fdt && fdt->pid != vpid(current))
514+
return 0;
515+
516+
ta->deferred_fds = (struct deferred_proc_fd *)rst_mem_align_cpos(RM_PRIVATE);
517+
ta->deferred_fds_n = 0;
518+
519+
list_for_each_entry(fle, list, ps_list) {
520+
struct file_desc *d = fle->desc;
521+
struct reg_file_info *rfi;
522+
struct deferred_proc_fd *df;
523+
char *orig_path;
524+
525+
if (d->ops->type != FD_TYPES__REG)
526+
continue;
527+
528+
rfi = container_of(d, struct reg_file_info, d);
529+
if (!rfi->deferred_thread_fd)
530+
continue;
531+
532+
df = rst_mem_alloc(sizeof(*df), RM_PRIVATE);
533+
if (!df)
534+
return -1;
535+
536+
orig_path = rfi->orig_path;
537+
538+
/*
539+
* orig_path is "proc/<pid>/task/<tid>/...", strip "proc/"
540+
* prefix to get a path relative to /proc for
541+
* sys_openat(proc_fd, ...).
542+
*/
543+
if (strncmp(orig_path, "proc/", 5) == 0)
544+
orig_path += 5;
545+
546+
if (strlen(orig_path) >= sizeof(df->path)) {
547+
pr_err("Deferred proc path too long: %s\n", orig_path);
548+
return -1;
549+
}
550+
551+
df->target_fd = fle->fe->fd;
552+
df->flags = rfi->rfe->flags;
553+
df->pos = rfi->rfe->pos;
554+
strncpy(df->path, orig_path, sizeof(df->path) - 1);
555+
df->path[sizeof(df->path) - 1] = '\0';
556+
557+
ta->deferred_fds_n++;
558+
pr_info("Collected deferred proc fd %d -> %s\n",
559+
df->target_fd, df->path);
560+
}
561+
562+
return 0;
563+
}
564+
503565
static int open_core(int pid, CoreEntry **pcore)
504566
{
505567
int ret;
@@ -676,6 +738,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
676738
if (collect_inotify_fds(ta) < 0)
677739
return -1;
678740

741+
if (collect_deferred_proc_fds(ta) < 0)
742+
return -1;
743+
679744
if (prepare_proc_misc(pid, core->tc, ta))
680745
return -1;
681746

@@ -1709,7 +1774,10 @@ static int restore_task_with_children(void *_arg)
17091774
}
17101775

17111776
int __attribute((weak)) arch_ptrace_restore(int pid, struct pstree_item *item);
1712-
int arch_ptrace_restore(int pid, struct pstree_item *item) { return 0; }
1777+
int arch_ptrace_restore(int pid, struct pstree_item *item)
1778+
{
1779+
return 0;
1780+
}
17131781

17141782
static int attach_to_tasks(bool root_seized)
17151783
{
@@ -3133,7 +3201,9 @@ static void *restorer_munmap_addr(CoreEntry *core, void *restorer_blob)
31333201
}
31343202

31353203
void arch_rsti_init(struct pstree_item *p) __attribute__((weak));
3136-
void arch_rsti_init(struct pstree_item *p) {}
3204+
void arch_rsti_init(struct pstree_item *p)
3205+
{
3206+
}
31373207

31383208
static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, unsigned long alen, CoreEntry *core)
31393209
{
@@ -3329,6 +3399,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
33293399
RST_MEM_FIXUP_PPTR(task_args->zombies);
33303400
RST_MEM_FIXUP_PPTR(task_args->vma_ios);
33313401
RST_MEM_FIXUP_PPTR(task_args->inotify_fds);
3402+
RST_MEM_FIXUP_PPTR(task_args->deferred_fds);
33323403

33333404
task_args->compatible_mode = core_is_compat(core);
33343405
/*

criu/files-reg.c

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2552,8 +2552,28 @@ int collect_filemap(struct vma_area *vma)
25522552

25532553
static int open_fe_fd(struct file_desc *fd, int *new_fd)
25542554
{
2555+
struct reg_file_info *rfi;
25552556
int tmp;
25562557

2558+
rfi = container_of(fd, struct reg_file_info, d);
2559+
2560+
if (rfi->deferred_thread_fd) {
2561+
/*
2562+
* This is a live-thread /proc/<pid>/task/<tid>/... fd.
2563+
* The thread doesn't exist yet (created later by the
2564+
* restorer via clone()), so open /dev/null as a
2565+
* placeholder. The restorer will reopen the correct
2566+
* path after threads are created.
2567+
*/
2568+
tmp = open("/dev/null", rfi->rfe->flags & O_ACCMODE);
2569+
if (tmp < 0) {
2570+
pr_perror("Can't open /dev/null for deferred proc fd");
2571+
return -1;
2572+
}
2573+
*new_fd = tmp;
2574+
return 0;
2575+
}
2576+
25572577
tmp = open_path(fd, do_open_reg, NULL);
25582578
if (tmp < 0)
25592579
return -1;
@@ -2690,18 +2710,15 @@ static int fixup_thread_proc_path(struct reg_file_info *rfi)
26902710
/*
26912711
* Live thread: the thread exists in the pstree but
26922712
* won't be created until the restorer blob runs
2693-
* clone(), which is after prepare_fds(). Rewrite the
2694-
* path to use the thread-group leader's tid instead;
2695-
* the leader is the only thread that exists at
2696-
* prepare_fds() time.
2713+
* clone(), which is after prepare_fds(). Mark this
2714+
* file as deferred -- open_fe_fd() will open /dev/null
2715+
* as a placeholder, and the restorer will reopen the
2716+
* correct path after threads exist.
26972717
*/
2698-
new_path = xmalloc(5 + 20 + 6 + 20 + strlen(tid_end) + 1);
2699-
if (!new_path)
2700-
return -1;
2701-
2702-
sprintf(new_path, "proc/%d/task/%d%s", pid, pid, tid_end);
2703-
pr_info("Rewrote live thread path: %s -> %s\n",
2704-
rfi->path, new_path);
2718+
rfi->deferred_thread_fd = true;
2719+
rfi->orig_path = rfi->path;
2720+
pr_info("Deferred live thread proc path: %s\n", rfi->path);
2721+
return 0;
27052722
}
27062723

27072724
rfi->path = new_path;
@@ -2722,6 +2739,8 @@ static int collect_one_regfile(void *o, ProtobufCMessage *base, struct cr_img *i
27222739
rfi->path = rfi->rfe->name + 1;
27232740
rfi->remap = NULL;
27242741
rfi->size_mode_checked = false;
2742+
rfi->deferred_thread_fd = false;
2743+
rfi->orig_path = NULL;
27252744

27262745
if (fixup_thread_proc_path(rfi))
27272746
return -1;

criu/include/files-reg.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ struct reg_file_info {
2424
struct file_remap *remap;
2525
bool size_mode_checked;
2626
bool is_dir;
27+
bool deferred_thread_fd; /* placeholder for live thread proc fd */
2728
char *path;
29+
char *orig_path; /* original proc path for deferred reopen */
2830
};
2931

3032
extern int open_reg_by_id(u32 id);

criu/include/restorer.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,13 @@ struct restore_vma_io {
141141

142142
#define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec))
143143

144+
struct deferred_proc_fd {
145+
int target_fd; /* fd number to dup2 over */
146+
int flags; /* open flags (O_RDONLY etc) */
147+
off_t pos; /* file position to restore */
148+
char path[128]; /* path relative to /proc, e.g. "1/task/7/stat" */
149+
};
150+
144151
struct task_restore_args {
145152
struct thread_restore_args *t; /* thread group leader */
146153

@@ -196,6 +203,9 @@ struct task_restore_args {
196203
int *inotify_fds; /* fds to cleanup inotify events at CR_STATE_RESTORE_SIGCHLD stage */
197204
unsigned int inotify_fds_n;
198205

206+
struct deferred_proc_fd *deferred_fds; /* live-thread proc fds to reopen after clone() */
207+
unsigned int deferred_fds_n;
208+
199209
/* * * * * * * * * * * * * * * * * * * * */
200210

201211
unsigned long task_size;
@@ -256,7 +266,7 @@ struct task_restore_args {
256266
* For arm64 stack needs to aligned to 16 bytes.
257267
* Hence align to 16 bytes for all
258268
*/
259-
#define RESTORE_ALIGN_STACK(start, size) (ALIGN((start) + (size)-16, 16))
269+
#define RESTORE_ALIGN_STACK(start, size) (ALIGN((start) + (size) - 16, 16))
260270

261271
static inline unsigned long restorer_stack(struct restore_mem_zone *mz)
262272
{

criu/pie/restorer.c

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,6 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ
366366
}
367367
}
368368

369-
370369
if (lsm_type != LSMTYPE__SELINUX) {
371370
/*
372371
* SELinux does not support setting the process context for
@@ -2198,6 +2197,36 @@ __visible long __export_restore_task(struct task_restore_args *args)
21982197
sys_close(fd);
21992198
}
22002199

2200+
/*
2201+
* Reopen deferred /proc/<pid>/task/<tid>/... fds now that
2202+
* all threads have been created by clone() above. The fds
2203+
* currently point to /dev/null placeholders.
2204+
*/
2205+
for (i = 0; i < args->deferred_fds_n; i++) {
2206+
struct deferred_proc_fd *df = &args->deferred_fds[i];
2207+
int dfd;
2208+
2209+
dfd = sys_openat(args->proc_fd, df->path, df->flags, 0);
2210+
if (dfd < 0) {
2211+
pr_err("Can't reopen deferred proc fd %d (%s): %ld\n",
2212+
df->target_fd, df->path, (long)dfd);
2213+
goto core_restore_end;
2214+
}
2215+
2216+
if (dfd != df->target_fd) {
2217+
if (sys_dup2(dfd, df->target_fd) != df->target_fd) {
2218+
pr_err("Can't dup2 deferred proc fd %d -> %d\n",
2219+
dfd, df->target_fd);
2220+
sys_close(dfd);
2221+
goto core_restore_end;
2222+
}
2223+
sys_close(dfd);
2224+
}
2225+
2226+
if (df->pos != 0 && df->pos != (off_t)-1)
2227+
sys_lseek(df->target_fd, df->pos, SEEK_SET);
2228+
}
2229+
22012230
restore_rlims(args);
22022231

22032232
ret = create_posix_timers(args);

0 commit comments

Comments
 (0)