Skip to content

Commit 1140b13

Browse files
committed
Honor MAP_SHARED coherence across fork
Both fork paths (CoW shm and legacy IPC byte-copy) silently broke MAP_SHARED visibility across fork: the child mapped the slab MAP_PRIVATE or got a fresh byte copy, so writes from either side stayed local and never reached the kernel page cache the parent shared with the file. MAP_SHARED|MAP_ANONYMOUS, the standard parent-child IPC primitive used by Postgres and other multi-process daemons, was equally broken.

Three pieces close the gap:

1. Parent-side conversion (mmap_fork_prepare_anon_shared, with commit/abort wrappers). While siblings are quiesced, the fork thread walks live regions, promotes each MAP_SHARED|MAP_ANONYMOUS region without a backing fd into a memfd-style overlay (mkstemp + unlink + ftruncate, pwrite-seed from host_base, host MAP_FIXED|MAP_SHARED via the new hvf_apply_file_overlay_quiesced helper, mark_overlay_metadata_range), and pre-stages per-region dup() fds so a transient EMFILE rolls back cleanly. The candidate filter skips regions whose host-page-rounded tail would alias a neighbor mapping. The transactional commit/abort wrappers let the fork-IPC failure path roll back the in-place conversion (overlay teardown plus region metadata restore) before resuming siblings; abort validates every captured snapshot before tearing down, so that sibling drift past the quiesce timeout does not leave host VA out of sync with semantic state. forkipc.c logs a warning when abort returns a partial failure so the parent's stale state is visible in post-mortem.

2. Child-side restoration (mmap_fork_restore_overlays). The recv path now snapshots parent overlay_active/start/end (and a new parent_had_fd[] mirror) before clearing inherited state, then re-runs hvf_apply_file_overlay against the saved overlay span once SCM_RIGHTS delivers the backing fds. The inner quiesce is a no-op since no worker vCPUs exist yet.

3. Pre-existing fork-IPC alignment bug. The old recv_backing_fds filter (!MAP_ANONYMOUS && offset != -1) matched the shim region (LINUX_MAP_PRIVATE, offset 0) and ELF text segments and silently stole incoming SCM_RIGHTS fds, leaving the actual file-backed regions with backing_fd=-1. The receiver now uses parent_had_fd[] as the filter so its iteration order matches the sender's "backing_fd >= 0" filter exactly. Unassigned fds are closed instead of leaked.

hvf_apply_file_overlay and hvf_remove_file_overlay are each split into a public variant that handles thread_quiesce_siblings and a _quiesced inner variant that the parent fork-prep / abort paths call without a nested barrier.

Locked in by tests/test-cross-fork-mapshared.c (3 cases: file-backed mkstemp, MAP_SHARED|MAP_ANONYMOUS, /dev/shm via shm_open). Each case verifies pre-fork seed visibility, child-write-visible-to-parent, parent-write-visible-to-child, and on-disk reconciliation. All three pass against Linux ground truth via tests/qemu-runner.sh.
1 parent c5ccb22 commit 1140b13

6 files changed

Lines changed: 1202 additions & 137 deletions

File tree

src/runtime/fork-state.c

Lines changed: 115 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
#include "debug/log.h"
2222
#include "syscall/abi.h"
2323
#include "syscall/internal.h"
24+
#include "syscall/mem.h"
2425
#include "syscall/proc.h"
2526

2627
int fork_ipc_write_all(int fd, const void *buf, size_t len)
@@ -494,7 +495,9 @@ static int fork_ipc_drain_bytes(int ipc_fd, uint32_t len)
494495
return 0;
495496
}
496497

497-
static int fork_ipc_recv_backing_fds(int ipc_fd, guest_t *g)
498+
static int fork_ipc_recv_backing_fds(int ipc_fd,
499+
guest_t *g,
500+
const bool *parent_had_fd)
498501
{
499502
uint32_t nbacking;
500503
if (fork_ipc_read_all(ipc_fd, &nbacking, sizeof(nbacking)) < 0) {
@@ -518,19 +521,59 @@ static int fork_ipc_recv_backing_fds(int ipc_fd, guest_t *g)
518521
.msg_controllen = cmsg_sz,
519522
};
520523
ssize_t nr = recvmsg(ipc_fd, &msg, 0);
521-
if (nr > 0) {
522-
struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
523-
if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
524-
cmsg->cmsg_type == SCM_RIGHTS) {
525-
int *region_fds = (int *) CMSG_DATA(cmsg);
526-
uint32_t fi = 0;
527-
for (int i = 0; i < g->nregions && fi < nbacking; i++) {
528-
if (!(g->regions[i].flags & LINUX_MAP_ANONYMOUS) &&
529-
g->regions[i].offset != (uint64_t) -1) {
530-
g->regions[i].backing_fd = region_fds[fi++];
531-
}
532-
}
533-
}
524+
if (nr <= 0) {
525+
free(cmsg_buf);
526+
return -1;
527+
}
528+
529+
struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
530+
if (msg.msg_flags & MSG_CTRUNC) {
531+
log_error("fork-child: backing fd SCM_RIGHTS payload truncated");
532+
free(cmsg_buf);
533+
return -1;
534+
}
535+
if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
536+
cmsg->cmsg_type != SCM_RIGHTS) {
537+
log_error("fork-child: missing backing fd SCM_RIGHTS payload");
538+
free(cmsg_buf);
539+
return -1;
540+
}
541+
if (cmsg->cmsg_len < CMSG_LEN(0)) {
542+
free(cmsg_buf);
543+
return -1;
544+
}
545+
546+
int *region_fds = (int *) CMSG_DATA(cmsg);
547+
uint32_t nreceived =
548+
(uint32_t) ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int));
549+
uint32_t fi = 0;
550+
551+
/* Sender (fork_ipc_send_backing_fds) iterates regions and sends one fd per
552+
* region with backing_fd >= 0. The receiver must iterate in the same order
553+
* over regions that had backing_fd in the parent. parent_had_fd[i] is
554+
* captured by the caller before backing_fd is cleared.
555+
*
556+
* The original filter (!MAP_ANONYMOUS && offset != -1) matched extra
557+
* regions like the shim and ELF text, so the first received fd was
558+
* misassigned and the actual file-backed region was left without
559+
* backing_fd.
560+
*/
561+
for (int i = 0; i < g->nregions && fi < nreceived; i++) {
562+
if (parent_had_fd && parent_had_fd[i])
563+
g->regions[i].backing_fd = region_fds[fi++];
564+
}
565+
566+
/* Close any received fds that did not get assigned: avoids leaking host fds
567+
* into the child's process table when a mismatch occurs.
568+
*/
569+
while (fi < nreceived)
570+
close(region_fds[fi++]);
571+
572+
if (nreceived != nbacking) {
573+
log_error("fork-child: expected %u backing fds but received %u",
574+
nbacking, nreceived);
575+
free(cmsg_buf);
576+
return -1;
534577
}
535578
free(cmsg_buf);
536579
return 0;
@@ -618,23 +661,73 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig)
618661
return -1;
619662
}
620663
g->nregions = (int) num_guest_regions;
664+
665+
/* Capture parent state before clearing the inherited overlay/backing fd
666+
* fields. parent_had_fd lets recv_backing_fds iterate in the same order the
667+
* sender used (regions with backing_fd >= 0); the parent_ovl_* arrays let
668+
* mmap_fork_restore_overlays know which regions to re-install, with what
669+
* overlay span. Heap-allocated to avoid pushing hundreds of KiB onto the
670+
* recv stack frame.
671+
*/
672+
bool *parent_had_fd = NULL;
673+
bool *parent_active = NULL;
674+
uint64_t *parent_ovl_start = NULL;
675+
uint64_t *parent_ovl_end = NULL;
676+
if (g->nregions > 0) {
677+
parent_had_fd = calloc((size_t) g->nregions, sizeof(*parent_had_fd));
678+
parent_active = calloc((size_t) g->nregions, sizeof(*parent_active));
679+
parent_ovl_start =
680+
calloc((size_t) g->nregions, sizeof(*parent_ovl_start));
681+
parent_ovl_end = calloc((size_t) g->nregions, sizeof(*parent_ovl_end));
682+
if (!parent_had_fd || !parent_active || !parent_ovl_start ||
683+
!parent_ovl_end) {
684+
log_error("fork-child: parent overlay buffer alloc failed");
685+
free(parent_had_fd);
686+
free(parent_active);
687+
free(parent_ovl_start);
688+
free(parent_ovl_end);
689+
return -1;
690+
}
691+
for (int i = 0; i < g->nregions; i++) {
692+
parent_had_fd[i] = (g->regions[i].backing_fd >= 0);
693+
parent_active[i] = g->regions[i].overlay_active;
694+
parent_ovl_start[i] = g->regions[i].overlay_start;
695+
parent_ovl_end[i] = g->regions[i].overlay_end;
696+
}
697+
}
698+
621699
for (int i = 0; i < g->nregions; i++) {
622700
g->regions[i].backing_fd = -1;
623-
/* Demote inherited overlays: the child does not yet re-establish
624-
* host MAP_FIXED|MAP_SHARED mappings from the parent's overlay
625-
* fds, so msync, MADV_DONTNEED and friends must use the
626-
* snapshot-style emulation. The CoW path's pre-fork sync of
627-
* overlay bytes into shm_fd already gave the child snapshot the
628-
* correct content at fork time. Live cross-fork MAP_SHARED
629-
* coherence is the next P1 TODO item.
701+
/* Drop inherited overlay metadata; the host MAP_FIXED|MAP_SHARED
702+
* mapping does not exist yet in the child. Re-establishment runs after
703+
* fork_ipc_recv_backing_fds populates backing_fd from the
704+
* parent-supplied SCM_RIGHTS bundle.
630705
*/
631706
g->regions[i].overlay_active = false;
632707
g->regions[i].overlay_start = 0;
633708
g->regions[i].overlay_end = 0;
634709
}
635710

636-
if (fork_ipc_recv_backing_fds(ipc_fd, g) < 0)
711+
if (fork_ipc_recv_backing_fds(ipc_fd, g, parent_had_fd) < 0) {
712+
free(parent_had_fd);
713+
free(parent_active);
714+
free(parent_ovl_start);
715+
free(parent_ovl_end);
637716
return -1;
717+
}
718+
719+
/* Re-install MAP_SHARED overlays for every region the parent had as
720+
* overlay_active and that now carries a backing fd. Failures here fall back
721+
* to snapshot semantics for the affected region; the child still boots and
722+
* can run.
723+
*/
724+
if (g->nregions > 0)
725+
(void) mmap_fork_restore_overlays(g, parent_active, parent_ovl_start,
726+
parent_ovl_end);
727+
free(parent_had_fd);
728+
free(parent_active);
729+
free(parent_ovl_start);
730+
free(parent_ovl_end);
638731

639732
if (fork_ipc_read_all(ipc_fd, sig, sizeof(*sig)) < 0) {
640733
log_error("fork-child: failed to read signal state");

src/runtime/forkipc.c

Lines changed: 52 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@
3535

3636
#include "syscall/abi.h"
3737
#include "syscall/internal.h"
38+
#include "syscall/mem.h"
3839
#include "syscall/net.h" /* absock namespace IPC state */
3940
#include "syscall/poll.h" /* wakeup_pipe_signal */
4041
#include "syscall/proc.h"
@@ -89,8 +90,8 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
8990
absock_set_namespace_id(hdr.absock_namespace_id);
9091
proc_set_session(hdr.sid, hdr.pgid);
9192

92-
/* Create guest memory before receiving state so all incoming offsets can
93-
* be bounds-checked against the negotiated guest size.
93+
/* Create guest memory before receiving state so all incoming offsets can be
94+
* bounds-checked against the negotiated guest size.
9495
*/
9596
guest_t g;
9697

@@ -176,6 +177,7 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
176177
guest_destroy(&g);
177178
return 1;
178179
}
180+
179181
/* POSIX: "Signals pending to the parent shall not be pending to the child."
180182
* Clear pending bitmask and RT queue before applying state.
181183
* signal_set_state() is deferred until after thread_register_main()
@@ -218,17 +220,17 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
218220
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, regs.tpidr_el0));
219221

220222
/* Enable MMU directly (page tables already in guest memory from IPC).
221-
* SCTLR must include MMU-enable (M), caches (C, I), RES1 bits,
222-
* and EL0 cache maintenance access (UCI, UCT) for JIT translators.
223+
* SCTLR must include MMU-enable (M), caches (C, I), RES1 bits, and EL0
224+
* cache maintenance access (UCI, UCT) for JIT translators.
223225
*/
224226
uint64_t sctlr_with_mmu = SCTLR_RES1 | SCTLR_M | SCTLR_C | SCTLR_I |
225227
SCTLR_DZE | SCTLR_UCT | SCTLR_UCI;
226228
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, sctlr_with_mmu));
227229

228-
/* Restore all 31 GPRs from parent state, then override X0=0 (child
229-
* clone return value). This preserves X1-X30 exactly as they were when
230-
* the parent called clone(), which is required by the Linux syscall ABI
231-
* (especially callee-saved X19-X28, FP=X29, LR=X30).
230+
/* Restore all 31 GPRs from parent state, then override X0=0 (child clone
231+
* return value). This preserves X1-X30 exactly as they were when the parent
232+
* called clone(), which is required by the Linux syscall ABI (especially
233+
* callee-saved X19-X28, FP=X29, LR=X30).
232234
*/
233235
vcpu_restore_gprs(vcpu, regs.x);
234236
vcpu_set_gpr(vcpu, 0, 0); /* Child gets 0 from clone */
@@ -246,14 +248,14 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
246248

247249
/* Register the fork child's main thread in the thread table.
248250
* Without this, current_thread is NULL and any syscall handler that
249-
* accesses per-thread state (signal masks, ptrace, CLONE_THREAD)
250-
* will dereference NULL.
251+
* accesses per-thread state (signal masks, ptrace, CLONE_THREAD) will
252+
* dereference NULL.
251253
*/
252254
thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1);
253255

254256
/* Now that current_thread is set, apply signal state. This must happen
255-
* after thread_register_main() so the per-thread blocked mask and
256-
* altstack are properly restored to the thread entry.
257+
* after thread_register_main() so the per-thread blocked mask and altstack
258+
* are properly restored to the thread entry.
257259
*/
258260
signal_set_state(&sig);
259261

@@ -921,6 +923,22 @@ int64_t sys_clone(hv_vcpu_t vcpu,
921923
*/
922924
thread_quiesce_siblings();
923925

926+
mmap_fork_anon_shared_txn_t *anon_shared_txn = NULL;
927+
guest_region_t *regions_snapshot = NULL;
928+
929+
/* Convert MAP_SHARED|MAP_ANONYMOUS regions that have no backing fd
930+
* into memfd-backed overlay regions. The conversion seeds a private
931+
* temp file with the current bytes and installs a host
932+
* MAP_SHARED|MAP_FIXED overlay on the parent. The child receives the
933+
* fd via SCM_RIGHTS and re-installs its own overlay so subsequent
934+
* writes from either side flow through the kernel page cache and
935+
* reach the other. File-backed MAP_SHARED regions already carry a
936+
* backing fd and are unaffected. Misaligned shared regions
937+
* (snapshot-style) remain incoherent across fork by design.
938+
*/
939+
if (mmap_fork_prepare_anon_shared(g, &anon_shared_txn) < 0)
940+
goto fail_snapshot;
941+
924942
/* Determine if elfuse can use the CoW (shm) fast path.
925943
* If shm_fd >= 0, elfuse freezes a snapshot via MAP_PRIVATE and sends the
926944
* shm fd to the child. Otherwise fall back to region-by-region copy.
@@ -947,8 +965,6 @@ int64_t sys_clone(hv_vcpu_t vcpu,
947965
* but before sibling vCPUs resume. Declared up front so all goto paths to
948966
* fail_snapshot can free it unconditionally.
949967
*/
950-
guest_region_t *regions_snapshot = NULL;
951-
952968
/* Header */
953969
ipc_header_t hdr = {
954970
.magic = IPC_MAGIC_HEADER,
@@ -1064,25 +1080,25 @@ int64_t sys_clone(hv_vcpu_t vcpu,
10641080
if (nregions_snapshot > 0) {
10651081
regions_snapshot = malloc(snap_sz);
10661082
if (!regions_snapshot) {
1067-
thread_resume_siblings();
1068-
close(ipc_sock);
1069-
return -LINUX_ENOMEM;
1083+
goto fail_snapshot;
10701084
}
10711085
memcpy(regions_snapshot, g->regions, snap_sz);
10721086
}
10731087

10741088
if (fork_ipc_send_fd_table(ipc_sock) < 0)
10751089
goto fail_snapshot;
10761090

1077-
/* Resume sibling vCPUs now that the memory snapshot, semantic region
1078-
* snapshot, and FD snapshot have been serialized.
1079-
*/
1080-
thread_resume_siblings();
1081-
10821091
uint32_t num_guest_regions = (uint32_t) nregions_snapshot;
10831092
if (fork_ipc_send_process_state(ipc_sock, regions_snapshot,
10841093
num_guest_regions) < 0)
1085-
goto fail_ipc;
1094+
goto fail_snapshot;
1095+
1096+
/* The process-state payload includes the SCM_RIGHTS handoff for region
1097+
* backing fds. Keep siblings quiesced until that send completes so a
1098+
* concurrent munmap/remap cannot close or recycle the captured fd numbers.
1099+
*/
1100+
thread_resume_siblings();
1101+
mmap_fork_commit_anon_shared(&anon_shared_txn);
10861102

10871103
close(ipc_sock);
10881104

@@ -1112,13 +1128,21 @@ int64_t sys_clone(hv_vcpu_t vcpu,
11121128
free(regions_snapshot);
11131129
return child_guest_pid;
11141130

1115-
fail_ipc:
1116-
free(regions_snapshot);
1117-
close(ipc_sock);
1118-
return -LINUX_ENOMEM;
1119-
11201131
fail_snapshot:
11211132
free(regions_snapshot);
1133+
/* Roll back the in-place anon-shared overlay conversion while
1134+
* siblings are still parked. A partial rollback failure (e.g.,
1135+
* region drift past the quiesce timeout) leaves the parent in a
1136+
* mixed state: the originating fork-IPC error is the user-visible
1137+
* one, but log abort failures so post-mortem can spot the
1138+
* lingering overlay without grepping for behavioral symptoms.
1139+
*/
1140+
int abort_rc = mmap_fork_abort_anon_shared(g, &anon_shared_txn);
1141+
if (abort_rc < 0)
1142+
log_warn(
1143+
"clone: anon-shared rollback partial failure (%d); parent "
1144+
"may have stale memfd-backed regions",
1145+
abort_rc);
11221146
thread_resume_siblings();
11231147
close(ipc_sock);
11241148
return -LINUX_ENOMEM;

0 commit comments

Comments
 (0)