Skip to content

Commit 3d2c86a

Browse files
committed
Add COW fork: Sandbox(policy, init, work) + sb.fork(env={})
Signed-off-by: Cong Wang <cwang@multikernel.io>
1 parent 65d74c3 commit 3d2c86a

File tree

5 files changed

+458
-12
lines changed

5 files changed

+458
-12
lines changed

examples/fork.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#!/usr/bin/env python3
2+
"""Demo: prove COW memory sharing works.
3+
4+
1. init() allocates a large buffer and stamps it with a unique ID
5+
2. work() reads the buffer — proves data survived fork (not re-init'd)
6+
3. Parent checks /proc/pid/smaps — proves pages are physically shared
7+
8+
Usage:
9+
python3 examples/fork.py
10+
"""
11+
12+
import os
13+
import sys
14+
import time
15+
import ctypes
16+
from sandlock import Sandbox, Policy
17+
18+
# Unique token set by init(), never re-set
19+
_TOKEN = None
20+
# Address of the buffer mapped by init(); work() reads the buffer through it.
_BUF_ADDR = None
21+
_BUF_SIZE = 10 * 1024 * 1024 # 10 MB
22+
23+
24+
def init():
    """Allocate the demo buffer and stamp every page with a token.

    Maps ``_BUF_SIZE`` bytes of anonymous, private memory through libc's
    ``mmap`` and writes this process's PID as a little-endian 64-bit integer
    at the start of every 4096-byte page.  The token and the mapping address
    are published in the module globals ``_TOKEN`` and ``_BUF_ADDR`` so that
    ``work()`` can read them later.

    Raises:
        OSError: If ``mmap`` fails (``_BUF_ADDR`` is left untouched).
    """
    global _TOKEN, _BUF_ADDR

    import mmap as mmap_consts
    import struct

    _TOKEN = os.getpid()  # the PID of whichever process ran init()

    # use_errno=True lets us report why mmap failed instead of crashing later.
    libc = ctypes.CDLL("libc.so.6", use_errno=True)
    libc.mmap.restype = ctypes.c_void_p
    libc.mmap.argtypes = [
        ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int,
        ctypes.c_int, ctypes.c_int, ctypes.c_long,
    ]
    addr = libc.mmap(
        None,
        _BUF_SIZE,
        mmap_consts.PROT_READ | mmap_consts.PROT_WRITE,
        mmap_consts.MAP_PRIVATE | mmap_consts.MAP_ANONYMOUS,
        -1,
        0,
    )

    # mmap signals failure with MAP_FAILED, i.e. (void *)-1; with a c_void_p
    # restype that arrives as the all-ones pointer value (NULL arrives as None).
    map_failed = ctypes.c_void_p(-1).value
    if addr is None or addr == map_failed:
        err = ctypes.get_errno()
        raise OSError(err, f"mmap of {_BUF_SIZE} bytes failed: {os.strerror(err)}")
    _BUF_ADDR = addr

    # Touch every page: each 4 KiB page starts with the token.
    buf = (ctypes.c_char * _BUF_SIZE).from_address(addr)
    for offset in range(0, _BUF_SIZE, 4096):
        struct.pack_into("<Q", buf, offset, _TOKEN)
45+
46+
47+
def work():
    """Check the buffer prepared by ``init()`` and write a one-line report.

    Reads the 64-bit token at the start of the buffer at ``_BUF_ADDR``,
    verifies that every 4096-byte page begins with that same token, and
    writes the outcome to ``/tmp/sandlock_cow_<SEED>``, where ``SEED`` is
    taken from the environment (default ``0``).
    """
    import struct

    seed = int(os.environ.get("SEED", "0"))
    my_pid = os.getpid()

    # One view over the whole mapping serves both the first-page read and
    # the per-page comparison.
    whole = (ctypes.c_char * _BUF_SIZE).from_address(_BUF_ADDR)
    (token,) = struct.unpack_from("<Q", whole, 0)

    # Short-circuits on the first page whose stamp differs.
    all_match = all(
        struct.unpack_from("<Q", whole, page_start)[0] == token
        for page_start in range(0, _BUF_SIZE, 4096)
    )

    report = (f"clone_pid={my_pid} init_pid={token} "
              f"pages_ok={all_match} buf_addr=0x{_BUF_ADDR:x}")
    with open(f"/tmp/sandlock_cow_{seed}", "w") as out:
        out.write(report)
70+
71+
72+
def get_shared_pages(pid):
    """Total the shared and private counters from ``/proc/<pid>/smaps``.

    Adds up the numeric field of every ``Shared_Clean``/``Shared_Dirty``
    line and, separately, of every ``Private_Clean``/``Private_Dirty`` line.

    Returns:
        A ``(shared, private)`` tuple of integers.  If the file cannot be
        opened or read, whatever was accumulated so far is returned
        (``(0, 0)`` when nothing was read).
    """
    shared_prefixes = ("Shared_Clean:", "Shared_Dirty:")
    private_prefixes = ("Private_Clean:", "Private_Dirty:")
    shared_total = 0
    private_total = 0

    try:
        with open(f"/proc/{pid}/smaps") as smaps:
            for entry in smaps:
                if entry.startswith(shared_prefixes):
                    shared_total += int(entry.split()[1])
                elif entry.startswith(private_prefixes):
                    private_total += int(entry.split()[1])
    except OSError:
        # Best effort: the process may already be gone or be unreadable.
        pass

    return shared_total, private_total
86+
87+
88+
def main():
    """Run the demo: fork clones from one template and report page sharing.

    Starts a ``Sandbox`` with ``init``/``work``, forks three clones with a
    distinct ``SEED`` each, prints the shared/private totals from each
    clone's smaps, waits for the clones, then prints and removes the result
    file each clone wrote under ``/tmp``.
    """
    clone_count = 3

    policy = Policy(
        fs_writable=["/tmp"],
        fs_readable=[sys.prefix, "/usr", "/lib", "/etc", "/proc", "/dev"],
    )

    print("=== COW Clone Proof ===\n", flush=True)

    with Sandbox(policy, init, work) as sb:
        # Fork the clones, keep them alive briefly to check smaps
        clones = [
            (seed, sb.fork(env={"SEED": str(seed)}))
            for seed in range(clone_count)
        ]

        # Give clones time to run
        time.sleep(1)

        # Check shared pages for each clone
        for seed, clone in clones:
            pid = clone._clone_pid
            if pid:
                shared_kb, private_kb = get_shared_pages(pid)
                print(f" Clone {seed} (PID {pid}): "
                      f"shared={shared_kb} KB, private={private_kb} KB",
                      flush=True)

        # Wait for all clones
        for _seed, clone in clones:
            clone.wait(timeout=5)

        # Read results
        print(flush=True)
        for seed in range(clone_count):
            path = f"/tmp/sandlock_cow_{seed}"
            # Open directly instead of exists()+open(): no race, and the
            # context manager closes the handle the original leaked.
            try:
                with open(path) as result_file:
                    report = result_file.read()
            except FileNotFoundError:
                print(f" Clone {seed}: no output", flush=True)
                continue
            print(f" Clone {seed}: {report}", flush=True)
            os.unlink(path)

    print("\n 10 MB buffer allocated once in init().", flush=True)
    print(" If shared > 0 KB, pages are physically shared (COW).", flush=True)
    print(" If init_pid matches across clones, data survived fork.", flush=True)
    print("\nDone.", flush=True)
133+
134+
135+
if __name__ == "__main__":
136+
main()

src/sandlock/_checkpoint.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,100 @@ def start_child_listener(
407407
_CheckpointListener(control_fd, save_fn).start()
408408

409409

410+
# --- Clone-ready loop (runs on main thread after init_fn returns) ---
411+
412+
# One-byte command written by request_fork() on the control socket, followed
# by the JSON-encoded env overrides; clone_ready_loop() forks when it reads it.
TRIGGER_FORK = b"\x03"
413+
414+
415+
def _flush_stdio() -> None:
    """Best-effort flush of Python's buffered stdout and stderr."""
    import sys

    for stream in (sys.stdout, sys.stderr):
        try:
            stream.flush()
        except Exception:
            # A closed or missing stream must never take the process down.
            pass


def _run_clone(control_fd: int, env: dict, work_fn: "Callable") -> None:
    """Run ``work_fn`` inside a freshly forked clone; never returns.

    The clone closes its copy of the control socket, moves into its own
    process group, applies ``env`` to ``os.environ`` and calls ``work_fn``.
    It always leaves through ``os._exit`` so it can never fall back into the
    template's loop.
    """
    status = 0
    try:
        os.close(control_fd)
        os.setpgid(0, 0)
        os.environ.update(env)
        work_fn()
    except SystemExit as e:
        if e.code is None:
            # Bare sys.exit() means success, not failure.
            status = 0
        elif isinstance(e.code, int):
            status = e.code
        else:
            status = 1
    except BaseException:
        status = 1
    finally:
        try:
            # os._exit() skips interpreter cleanup, so anything work_fn
            # printed would otherwise be lost in the stdio buffers.
            _flush_stdio()
        finally:
            os._exit(status)


def clone_ready_loop(control_fd: int, work_fn: "Callable") -> None:
    """Main-thread loop: wait for fork commands, fork and run work_fn.

    Blocks on ``os.read()`` of the control socket.  Each time the parent
    sends ``TRIGGER_FORK`` followed by a JSON env payload, the loop calls
    ``os.fork()``; the clone runs ``work_fn`` with that env applied, while
    the template replies with the clone's PID as a big-endian 32-bit integer
    (``0`` if ``fork()`` failed).  Bytes other than ``TRIGGER_FORK`` are
    ignored.  The loop returns when the socket reaches EOF or a read fails.

    Args:
        control_fd: Child's end of the control socket.
        work_fn: Function to run in each clone.
    """
    while True:
        try:
            trigger = os.read(control_fd, 1)
        except OSError:
            break
        if not trigger:
            break  # EOF: the parent closed its end

        if trigger != TRIGGER_FORK:
            continue

        try:
            env_json = _recv_bytes(control_fd)
        except (EOFError, OSError):
            break

        env = json.loads(env_json) if env_json else {}

        # Empty the stdio buffers first so the clone does not inherit (and
        # later re-emit) output that belongs to the template.
        _flush_stdio()

        try:
            pid = os.fork()
        except OSError:
            # PID 0 tells the parent that fork() failed.
            os.write(control_fd, struct.pack(">I", 0))
            continue

        if pid == 0:
            # === Clone child === (never returns)
            _run_clone(control_fd, env, work_fn)

        # === Template: send clone PID back ===
        os.write(control_fd, struct.pack(">I", pid))
473+
474+
475+
def request_fork(
    control_fd: int,
    env: dict[str, str] | None = None,
) -> int:
    """Send a fork command to the template's clone-ready loop.

    Writes ``TRIGGER_FORK`` and the JSON-encoded ``env`` to the control
    socket, then reads the 4-byte big-endian PID the template sends back.

    Args:
        control_fd: Parent's end of the control socket.
        env: Environment variable overrides for the clone.

    Returns:
        PID of the clone child.

    Raises:
        RuntimeError: If the template closed the socket before replying, or
            reported that ``fork()`` failed (PID ``0``).
    """
    env_json = json.dumps(env or {}).encode()
    os.write(control_fd, TRIGGER_FORK)
    _send_bytes(control_fd, env_json)

    # A single read may legally return fewer than 4 bytes; keep reading
    # until the reply is complete or the peer closes the socket.
    raw = b""
    while len(raw) < 4:
        chunk = os.read(control_fd, 4 - len(raw))
        if not chunk:
            break
        raw += chunk

    if len(raw) < 4:
        raise RuntimeError("Fork: no response from child")
    pid = struct.unpack(">I", raw)[0]
    if pid == 0:
        raise RuntimeError("Fork: fork() failed in child")
    return pid
502+
503+
410504
# --- Parent side ---
411505

412506
def request_app_state(control_fd: int) -> bytes:

src/sandlock/_context.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -197,11 +197,13 @@ def __init__(
197197
save_fn: Callable[[], bytes] | None = None,
198198
overlay_branch: "object | None" = None,
199199
cow_branch: "object | None" = None,
200+
clone_loop_fn: "Callable[[int], None] | None" = None,
200201
):
201202
self._target = target
202203
self._policy = policy
203204
self._sandbox_id = sandbox_id
204205
self._save_fn = save_fn
206+
self._clone_loop_fn = clone_loop_fn
205207
self._overlay_branch = overlay_branch or getattr(policy, '_overlay_branch', None)
206208
self._cow_branch = cow_branch or getattr(policy, '_cow_branch', None)
207209
self._pid: Optional[int] = None
@@ -650,10 +652,12 @@ def __enter__(self) -> "SandboxContext":
650652
if self._policy.strict:
651653
raise
652654

653-
# 5. Start checkpoint listener thread (if save_fn provided)
655+
# 5. Start checkpoint listener thread (if save_fn provided); clone mode starts no thread and only keeps ctrl_child_fd open
654656
# Must happen BEFORE seccomp — seccomp blocks clone3
655657
# which Python's threading module uses.
656-
if self._save_fn is not None:
658+
if self._clone_loop_fn is not None:
659+
pass # Keep ctrl_child_fd open for clone_ready_loop
660+
elif self._save_fn is not None:
657661
try:
658662
start_child_listener(ctrl_child_fd, self._save_fn)
659663
except RuntimeError:
@@ -752,8 +756,11 @@ def __enter__(self) -> "SandboxContext":
752756
n = self._policy.max_open_files
753757
resource.setrlimit(resource.RLIMIT_NOFILE, (n, n))
754758

755-
# 10. Run target
756-
self._target()
759+
# 10. Run target (or clone-ready loop)
760+
if self._clone_loop_fn is not None:
761+
self._clone_loop_fn(ctrl_child_fd)
762+
else:
763+
self._target()
757764
os._exit(0)
758765
except SystemExit as e:
759766
os._exit(e.code if isinstance(e.code, int) else 1)

0 commit comments

Comments
 (0)