Skip to content

Commit e105b64

Browse files
committed
Fix vDSO patching: patch before dispatch, detect exec, pre-import modules
Signed-off-by: Cong Wang <cwang@multikernel.io>
1 parent 87a4ddf commit e105b64

File tree

4 files changed

+54
-53
lines changed

4 files changed

+54
-53
lines changed

src/sandlock/_context.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,10 @@ def __enter__(self) -> "SandboxContext":
475475
# just sys.modules lookups.
476476
if use_notif:
477477
from ._notif import install_notif_filter, send_fd # noqa: F811
478+
if (self._notif_policy is not None
479+
and self._notif_policy.time_start is not None):
480+
import time as _time # noqa: F811
481+
from ._vdso import disable_vdso_local # noqa: F811
478482
if self._save_fn is not None:
479483
from ._checkpoint import start_child_listener # noqa: F811
480484
# User namespace is needed for privileged mode or overlayfs

src/sandlock/_notif.py

Lines changed: 41 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -424,8 +424,7 @@ def __init__(
424424
# Deterministic time
425425
self._time_offset = None # TimeOffset | None
426426
self._mono_offset_s: int = 0 # monotonic offset for vDSO stubs
427-
self._vdso_patch_fd: int = -1 # pre-opened /proc/pid/mem
428-
self._vdso_patch_writes: list[tuple[int, bytes]] = [] # (offset, stub)
427+
self._vdso_patched_addr: int = 0 # vDSO base address we last patched
429428
self._virtual_btime: int = 0 # virtual boot time for /proc/stat
430429
if policy.time_start is not None:
431430
import time as _time
@@ -556,6 +555,12 @@ def _handle_one(self) -> None:
556555
if ret < 0:
557556
return # ENOENT = child died, EINTR = interrupted
558557

558+
# Patch vDSO before dispatching (child is stopped in seccomp
559+
# notification state, so /proc/pid/mem writes are reliable).
560+
# Re-patch when the vDSO address changes (exec replaces the vDSO).
561+
if self._time_offset is not None:
562+
self._maybe_patch_vdso(notif.pid)
563+
559564
try:
560565
self._dispatch(notif)
561566
except Exception:
@@ -566,53 +571,45 @@ def _handle_one(self) -> None:
566571
except Exception:
567572
pass
568573

569-
# Post-dispatch: patch vDSO for new PIDs (after exec).
570-
# Two-phase approach:
571-
# Phase 1 (this notification): pre-compute — open fd, parse
572-
# vDSO ELF, compute offsets. This is slow but the child
573-
# will be unfrozen when dispatch responds above.
574-
# Phase 2 (next notification): minimal lseek+write using the
575-
# pre-computed fd and offsets. Fast enough to land while
576-
# the child is briefly running after the previous response.
577-
if self._vdso_patch_writes:
578-
# Phase 2: fast write (child briefly running after respond)
579-
fd = self._vdso_patch_fd
580-
for off, stub in self._vdso_patch_writes:
581-
os.lseek(fd, off, os.SEEK_SET)
582-
os.write(fd, stub)
583-
os.close(fd)
584-
self._vdso_patch_fd = -1
585-
self._vdso_patch_writes = []
586-
elif self._time_offset is not None and self._vdso_patch_fd == -1:
587-
# Phase 1: pre-compute (first notification from new PID)
588-
pid = notif.pid
589-
from ._vdso import _find_vdso, _parse_vdso_symbols, _get_stubs
590-
info = _find_vdso(pid)
591-
stubs = _get_stubs(self._mono_offset_s)
592-
if info and stubs:
593-
addr, size = info
594-
try:
595-
fd = os.open(f"/proc/{pid}/mem", os.O_RDWR)
596-
os.lseek(fd, addr, os.SEEK_SET)
597-
data = os.read(fd, size)
598-
writes = []
599-
for name, off in _parse_vdso_symbols(data):
600-
stub = stubs.get(name)
601-
if stub:
602-
writes.append((addr + off, stub))
603-
if writes:
604-
self._vdso_patch_fd = fd
605-
self._vdso_patch_writes = writes
606-
else:
607-
os.close(fd)
608-
except OSError:
609-
pass
610-
611574
@property
def tracked_pids(self) -> set[int]:
    """Return a fresh set of every PID recorded for this sandbox.

    The result is a copy built from ``self._proc_pids``, so callers may
    mutate it without touching the notifier's own bookkeeping.
    """
    return {*self._proc_pids}
615578

579+
def _maybe_patch_vdso(self, pid: int) -> None:
    """Patch the child's vDSO to force real syscalls, if needed.

    Called while the child is stopped in seccomp notification state,
    so /proc/pid/mem writes land reliably before the child resumes.
    Tracks the vDSO base address to detect exec (which replaces the
    vDSO at a new address) and re-patch automatically.

    The patch is best-effort: any OSError (process gone, permission
    denied, ...) is swallowed.  The address is only recorded as patched
    once the whole vDSO image was read and every matching stub was
    written in full, so a truncated read or partial write is retried on
    the next notification instead of being silently treated as done.

    Args:
        pid: PID of the process that raised the current notification.
    """
    from ._vdso import _find_vdso, _parse_vdso_symbols, _get_stubs

    info = _find_vdso(pid)
    if not info:
        return
    addr, size = info
    # NOTE(review): a single address is remembered for all PIDs.  Two
    # sandboxed processes whose vDSOs share a base address (e.g. ASLR
    # disabled) would look "already patched" -- confirm whether per-PID
    # tracking is needed.
    if addr == self._vdso_patched_addr:
        return  # already patched this vDSO
    stubs = _get_stubs(self._mono_offset_s)
    if not stubs:
        return
    try:
        fd = os.open(f"/proc/{pid}/mem", os.O_RDWR)
        try:
            os.lseek(fd, addr, os.SEEK_SET)
            data = os.read(fd, size)
            if len(data) != size:
                # Short read: the symbol table would be parsed from a
                # truncated image.  Leave the address unrecorded so the
                # next notification retries.
                return
            # NOTE(review): a non-OSError raised by the parser would
            # propagate to the caller; presumably it only raises on
            # malformed input -- verify against _vdso.
            for name, off in _parse_vdso_symbols(data):
                stub = stubs.get(name)
                if not stub:
                    continue
                os.lseek(fd, addr + off, os.SEEK_SET)
                if os.write(fd, stub) != len(stub):
                    # Partial write: do not mark as patched; the whole
                    # stub is rewritten on the next attempt.
                    return
            self._vdso_patched_addr = addr
        finally:
            os.close(fd)
    except OSError:
        pass
612+
616613
def _dispatch(self, notif: SeccompNotif) -> None:
617614
"""Route a notification to the appropriate handler."""
618615
# Lazily track every PID that makes an intercepted syscall

src/sandlock/_vdso.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
so seccomp can intercept them for time virtualization.
88
99
For the forked child (before exec): patches in-process via mprotect + write.
10-
For Sandbox.run (after exec): patches via /proc/pid/mem with retries,
11-
since writes only take effect when the child is not in seccomp-stop.
10+
For Sandbox.run (after exec): patches via /proc/pid/mem while the child
11+
is stopped in seccomp notification state, ensuring reliable delivery.
1212
"""
1313

1414
from __future__ import annotations

tests/test_integration.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1323,7 +1323,7 @@ def test_cow_utimensat_committed(self, isolation):
13231323
# --- Deterministic time ---
13241324

13251325
class TestDeterministicTime:
1326-
@pytest.mark.xfail(reason="vDSO patching via /proc/pid/mem is racy for exec'd processes")
1326+
13271327
def test_time_start_accepts_unix_timestamp(self):
13281328
"""time_start accepts a numeric Unix timestamp."""
13291329
# 946684800 = 2000-01-01T00:00:00Z
@@ -1399,7 +1399,7 @@ def test_time_start_monotonic_near_zero_run(self):
13991399
mono = float(result.stdout.strip())
14001400
assert mono < 5.0, f"Monotonic too high: {mono}"
14011401

1402-
@pytest.mark.xfail(reason="vDSO patching via /proc/pid/mem is racy for exec'd processes")
1402+
14031403
def test_time_syscall_shifted_run_ctypes(self):
14041404
"""time() syscall is shifted (used by uptime, w, etc.)."""
14051405
policy = Policy(
@@ -1408,8 +1408,8 @@ def test_time_syscall_shifted_run_ctypes(self):
14081408
)
14091409
result = Sandbox(policy).run(
14101410
["python3", "-c",
1411-
"import ctypes, ctypes.util; "
1412-
"libc = ctypes.CDLL(ctypes.util.find_library('c')); "
1411+
"import ctypes; "
1412+
"libc = ctypes.CDLL(None); "
14131413
"libc.time.restype = ctypes.c_long; "
14141414
"print(libc.time(0))"]
14151415
)
@@ -1448,12 +1448,12 @@ def test_proc_stat_btime_virtualized(self):
14481448
btime = int(result.stdout.strip().split()[1])
14491449
assert btime == 946684800, f"btime not virtualized: {btime}"
14501450

1451-
@pytest.mark.xfail(reason="vDSO patching via /proc/pid/mem is racy for exec'd processes")
1451+
14521452
def test_timerfd_abstime_works(self):
14531453
"""timerfd_settime with TFD_TIMER_ABSTIME fires correctly."""
14541454
code = (
1455-
"import ctypes, ctypes.util, struct, os, time\n"
1456-
"libc = ctypes.CDLL(ctypes.util.find_library('c'))\n"
1455+
"import ctypes, struct, os, time\n"
1456+
"libc = ctypes.CDLL(None)\n"
14571457
"CLOCK_MONOTONIC = 1\n"
14581458
"TFD_TIMER_ABSTIME = 1\n"
14591459
"fd = libc.timerfd_create(CLOCK_MONOTONIC, 0)\n"

0 commit comments

Comments
 (0)