Skip to content

Commit 84619f7

Browse files
committed
Optimize COW fork: bypass seccomp notif via raw fork(2) syscall
Signed-off-by: Cong Wang <cwang@multikernel.io>
1 parent 3ff6ba1 commit 84619f7

File tree

4 files changed

+106
-15
lines changed

4 files changed

+106
-15
lines changed

src/sandlock/_checkpoint.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -412,14 +412,38 @@ def start_child_listener(
412412
TRIGGER_FORK = b"\x03"
413413

414414

415+
import ctypes as _ctypes
import ctypes.util as _ctypes_util
import platform as _platform

# Load libc through PyDLL rather than CDLL so the GIL stays held for the
# duration of the syscall. CDLL releases the GIL around every foreign call;
# if another thread happened to own the GIL at the instant of the fork, the
# child (which contains only the forking thread) would block forever trying
# to take it back when the call returns.
_libc = _ctypes.PyDLL(_ctypes_util.find_library("c"), use_errno=True)
_libc.syscall.restype = _ctypes.c_long
# fork(2) is syscall number 57 on x86_64. On any other machine the number
# is left unset and _raw_fork() falls back to os.fork().
_NR_FORK = 57 if _platform.machine() == "x86_64" else None


def _raw_fork() -> int:
    """Fork via the raw fork(2) syscall, bypassing the libc fork() wrapper.

    ``os.fork()`` goes through libc's ``fork()``, which issues ``clone``;
    the seccomp filter routes ``clone`` through USER_NOTIF for process
    counting. The plain ``fork`` syscall (NR 57 on x86_64) is not in the
    notif list, so it passes straight through BPF without the supervisor
    round-trip.

    On machines where ``_NR_FORK`` is not known this simply calls
    ``os.fork()``.

    Returns:
        0 in the child, the child's pid in the parent.

    Raises:
        OSError: If the syscall fails; carries the errno reported by the
            kernel.

    NOTE(review): unlike ``os.fork()``, the raw-syscall path does not run
    CPython's fork hooks (``PyOS_BeforeFork`` / ``PyOS_AfterFork_*``,
    ``os.register_at_fork`` callbacks) or libc's atfork handlers. Confirm
    that the clone does not depend on state those hooks would reset.
    """
    if _NR_FORK is None:
        return os.fork()

    pid = _libc.syscall(_ctypes.c_long(_NR_FORK))
    if pid < 0:
        # use_errno=True makes ctypes capture errno right after the call.
        err = _ctypes.get_errno()
        raise OSError(err, os.strerror(err))
    return pid
438+
439+
415440
def clone_ready_loop(control_fd: int, work_fn: "Callable") -> None:
416441
"""Main-thread loop: wait for fork commands, fork and run work_fn.
417442
418443
After init_fn returns, the main thread enters this loop. It blocks
419444
on ``os.read()`` (GIL released), so no CPU is wasted. When the
420-
parent sends TRIGGER_FORK, the main thread calls ``os.fork()`` —
421-
giving the clone a full COW copy of all memory (including
422-
everything init_fn set up in globals/heap).
445+
parent sends TRIGGER_FORK, the main thread calls raw ``fork(2)``
446+
(not ``os.fork()``), bypassing the seccomp USER_NOTIF round-trip.
423447
424448
Args:
425449
control_fd: Child's end of the control socket.
@@ -450,7 +474,7 @@ def clone_ready_loop(control_fd: int, work_fn: "Callable") -> None:
450474
pass
451475

452476
try:
453-
pid = os.fork()
477+
pid = _raw_fork()
454478
except OSError:
455479
os.write(control_fd, struct.pack(">I", 0))
456480
continue

src/sandlock/_context.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,15 @@ def _notif_syscall_names(notif: "NotifPolicy") -> list[str]:
9797
names = ["openat"]
9898
if "open" in _SYSCALL_NR:
9999
names.append("open")
100-
# Always intercept clone/fork/vfork/clone3 — the supervisor checks
101-
# namespace flags (which BPF can't inspect for clone3) and tracks
102-
# process creation.
103-
names.extend(["clone", "clone3", "fork", "vfork"])
100+
# Intercept clone/clone3/vfork via USER_NOTIF for namespace flag
101+
# checks and process counting. Clone namespace flags are also
102+
# blocked by a BPF arg filter as defense in depth.
103+
#
104+
# The raw fork syscall (NR 57) is NOT intercepted. It takes no
105+
# flags and cannot create namespaces. The COW fork template uses
106+
# raw fork(2) via ctypes to bypass the seccomp notif round-trip.
107+
# User code calling os.fork() goes through clone (intercepted); a raw fork(2) issued directly by sandboxed code is NOT counted here.
108+
names.extend(["clone", "clone3", "vfork"])
104109
if notif is not None and notif.allowed_ips:
105110
names.extend(["connect", "sendto"])
106111
if notif is not None and notif.port_remap:

src/sandlock/_seccomp.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -285,16 +285,25 @@ def _build_arg_filters() -> bytes:
285285
These filters check specific arguments rather than blocking the
286286
syscall entirely:
287287
288-
- ioctl(2): Block TIOCSTI (terminal input injection). Normal
289-
terminal I/O and isatty() still work.
290-
291-
Note: clone/clone3 namespace flag checks are handled in the
292-
supervisor via USER_NOTIF, not here.
288+
- clone(2): Block namespace flags (CLONE_NEW*) with ERRNO.
289+
Plain forks fall through to the main filter (USER_NOTIF if
290+
clone is in the notif list, or ALLOW if not).
291+
- ioctl(2): Block TIOCSTI (terminal input injection).
293292
"""
294293
insns = bytearray()
295294

296-
# --- clone/clone3: handled via USER_NOTIF (namespace flag checks
297-
# and process tracking done in supervisor) ---
295+
# --- clone: block namespace creation flags ---
296+
nr_clone = _SYSCALL_NR.get("clone")
297+
if nr_clone is not None:
298+
# Load syscall number
299+
insns += _bpf_stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR)
300+
# if nr != clone, skip this block (3 instructions ahead)
301+
insns += _bpf_jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3)
302+
# Load clone flags (arg0, low 32 bits)
303+
insns += _bpf_stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO)
304+
# Test: flags & NS_FLAGS → ERRNO if set, fall through if not
305+
insns += _bpf_jump(BPF_JMP | BPF_JSET | BPF_K, _CLONE_NS_FLAGS, 0, 1)
306+
insns += _bpf_stmt(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | ERRNO_EPERM)
298307

299308
# --- ioctl: block TIOCSTI (terminal input injection) ---
300309
# Load syscall number

tests/test_clone.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,5 +95,58 @@ def child_side():
9595
child.close()
9696

9797

98+
class TestMaxProcessesInClone(unittest.TestCase):
    """Verify that max_processes is enforced inside COW clones.

    The template creates clones with a raw fork(2) syscall, which the
    seccomp USER_NOTIF filter does not intercept. The clones still
    inherit the filter that intercepts clone (what ``os.fork()`` uses),
    so process limits must still be enforced inside work().
    """

    def test_clone_inherits_process_limit(self):
        """work() cannot fork more than max_processes allows."""
        import sys
        import tempfile

        # A private directory replaces tempfile.mktemp(), which is
        # deprecated because the name it returns can be claimed by another
        # process before use. dir="/tmp" keeps the marker under the
        # policy's fs_writable root, and the context manager removes it
        # even when an assertion below fails.
        with tempfile.TemporaryDirectory(
            prefix="sandlock_test_maxproc_", dir="/tmp"
        ) as scratch:
            marker = os.path.join(scratch, "count")

            def init():
                pass

            def work():
                # Fork (and reap) children until the sandbox refuses, then
                # record how many forks succeeded.
                count = 0
                for _ in range(20):
                    try:
                        pid = os.fork()
                        if pid == 0:
                            os._exit(0)
                        os.waitpid(pid, 0)
                        count += 1
                    except OSError:
                        break
                with open(marker, "w") as f:
                    f.write(str(count))

            policy = Policy(
                fs_writable=["/tmp"],
                fs_readable=[sys.prefix, "/usr", "/lib", "/etc", "/proc", "/dev"],
                max_processes=5,
            )

            with Sandbox(policy, init, work) as sb:
                sb.fork().wait(timeout=10)

            self.assertTrue(os.path.exists(marker))
            with open(marker) as f:
                count = int(f.read())

        # With max_processes=5 the clone must be stopped well before its
        # 20 attempts. The exact number of successful forks depends on
        # how the supervisor counts processes (the template's raw fork
        # is not intercepted), so only the bounds are checked.
        self.assertLess(count, 20, "max_processes not enforced in clone")
        self.assertGreater(count, 0, "clone couldn't fork at all")
149+
150+
98151
if __name__ == "__main__":
99152
unittest.main()

0 commit comments

Comments
 (0)