Skip to content

Commit 27201d9

Browse files
committed
Filter /proc readdir to hide foreign PIDs via getdents64 interception
Signed-off-by: Cong Wang <cwang@multikernel.io>
1 parent 50d5eb9 commit 27201d9

File tree

5 files changed: +253 additions, -22 deletions (lines changed)

src/sandlock/_context.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def _pidfd_poll(pidfd: int, timeout_s: float) -> bool:
8080

8181
# --- Syscalls to intercept for notification ---
8282

83-
def _notif_syscall_names(policy: "Policy") -> list[str]:
83+
def _notif_syscall_names(notif: "NotifPolicy") -> list[str]:
8484
"""Return the list of syscalls to intercept via user notification.
8585
8686
openat is always intercepted. open is added on x86_64 (not
@@ -91,13 +91,16 @@ def _notif_syscall_names(policy: "Policy") -> list[str]:
9191
names = ["openat"]
9292
if "open" in _SYSCALL_NR:
9393
names.append("open")
94-
notif = policy.notif_policy
9594
if notif is not None and notif.allowed_ips:
9695
names.extend(["connect", "sendto"])
9796
if notif is not None and notif.max_memory_bytes > 0:
9897
names.extend(["mmap", "munmap", "brk", "mremap"])
9998
if notif is not None and notif.max_processes > 0:
10099
names.extend(["clone", "fork", "vfork"])
100+
if notif is not None and notif.isolate_pids:
101+
names.append("getdents64")
102+
if "getdents" in _SYSCALL_NR:
103+
names.append("getdents")
101104
# Deduplicate (clone/open may already be in the list)
102105
return list(dict.fromkeys(names))
103106

@@ -298,8 +301,18 @@ def _close_pidfd(self) -> None:
298301
self._control_fd = -1
299302

300303
def __enter__(self) -> "SandboxContext":
301-
notif_policy = self._policy.notif_policy
302-
use_notif = notif_policy is not None
304+
# Auto-enable /proc PID isolation when /proc is readable
305+
self._notif_policy = self._policy.notif_policy
306+
if self._notif_policy is None and any(
307+
p == "/proc" or p.rstrip("/") == "/proc"
308+
for p in self._policy.fs_readable
309+
):
310+
from ._notif_policy import NotifPolicy, default_proc_rules
311+
self._notif_policy = NotifPolicy(
312+
rules=default_proc_rules(),
313+
isolate_pids=True,
314+
)
315+
use_notif = self._notif_policy is not None
303316

304317
# Pre-import modules used in the child BEFORE fork — the child's
305318
# Landlock policy won't include the sandlock source directory, so
@@ -442,7 +455,7 @@ def __enter__(self) -> "SandboxContext":
442455
from ._landlock import _set_no_new_privs
443456
_set_no_new_privs()
444457
notify_fd = install_notif_filter(
445-
_notif_syscall_names(self._policy),
458+
_notif_syscall_names(self._notif_policy),
446459
deny_syscalls=deny,
447460
allow_syscalls=allow,
448461
)
@@ -547,7 +560,7 @@ def __enter__(self) -> "SandboxContext":
547560
parent_sock.close()
548561
pids_fn = lambda pgid=pid: _pids_by_pgid(pgid) # noqa: E731
549562
self._supervisor = NotifSupervisor(
550-
notify_fd, pid, notif_policy,
563+
notify_fd, pid, self._notif_policy,
551564
pids_fn=pids_fn,
552565
)
553566
self._supervisor.start()

src/sandlock/_notif.py

Lines changed: 159 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
from .exceptions import NotifError
2626
from ._notif_policy import NotifAction, NotifPolicy
27-
from ._procfs import read_bytes, resolve_openat_path
27+
from ._procfs import read_bytes, write_bytes, resolve_openat_path
2828
from ._seccomp import (
2929
AUDIT_ARCH,
3030
BPF_ABS,
@@ -355,6 +355,72 @@ def _parse_msghdr_dest_ip(pid: int, msghdr_addr: int) -> str | None:
355355
return _parse_dest_ip(pid, name_addr, name_len)
356356

357357

358+
# --- getdents64 helpers ---
359+
360+
def _build_dirent64(d_ino: int, d_off: int, d_type: int, name: str) -> bytes:
361+
"""Build a single linux_dirent64 entry.
362+
363+
struct linux_dirent64 {
364+
u64 d_ino; // 0
365+
s64 d_off; // 8
366+
u16 d_reclen; // 16
367+
u8 d_type; // 18
368+
char d_name[]; // 19+
369+
};
370+
d_reclen is 8-byte aligned.
371+
"""
372+
name_bytes = name.encode("utf-8") + b"\0"
373+
# 19 bytes header + name + padding to 8-byte alignment
374+
reclen = 19 + len(name_bytes)
375+
reclen = (reclen + 7) & ~7 # align to 8
376+
buf = bytearray(reclen)
377+
struct.pack_into("QqHB", buf, 0, d_ino, d_off, reclen, d_type)
378+
buf[19:19 + len(name_bytes)] = name_bytes
379+
return bytes(buf)
380+
381+
382+
def _build_filtered_dirents(sandbox_pids: set[int]) -> list[bytes]:
    """Build dirent64 records for /proc, leaving out foreign PID directories.

    The real /proc is scanned in the calling (supervisor) process. Every
    entry whose name is a decimal PID not contained in ``sandbox_pids`` is
    dropped; every other entry is turned into a synthetic record via
    ``_build_dirent64``.

    Args:
        sandbox_pids: PIDs whose /proc/<pid> directories stay visible.

    Returns:
        Packed dirent64 records in scan order. ``d_off`` is a running
        1-based counter over the entries that were kept. If /proc cannot
        be scanned, the records collected so far (possibly none) are
        returned.

    NOTE(review): ``os.scandir`` never yields "." and "..", so the
    synthetic listing lacks those two entries that a real getdents64
    would report — confirm no sandboxed tool depends on them.
    """
    DT_DIR = 4
    DT_REG = 8
    DT_LNK = 10
    entries: list[bytes] = []
    d_off = 0
    try:
        with os.scandir("/proc") as it:
            for entry in it:
                name = entry.name
                # Restrict to ASCII digits: str.isdigit() also accepts
                # characters such as "²" for which int() raises ValueError.
                is_pid_dir = name.isascii() and name.isdigit()
                if is_pid_dir and int(name) not in sandbox_pids:
                    continue

                d_off += 1
                try:
                    if entry.is_dir(follow_symlinks=False):
                        d_type = DT_DIR
                    elif entry.is_symlink():
                        d_type = DT_LNK
                    else:
                        d_type = DT_REG
                except OSError:
                    # The entry may vanish mid-scan; report a regular file.
                    d_type = DT_REG

                try:
                    d_ino = entry.inode()
                except OSError:
                    d_ino = 0

                entries.append(_build_dirent64(d_ino, d_off, d_type, name))
    except OSError:
        # Deliberate best effort: an unreadable /proc yields a short or
        # empty listing instead of an exception in the supervisor.
        pass
    return entries
422+
423+
358424
# --- Notification supervisor ---
359425

360426
class NotifSupervisor:
@@ -384,6 +450,8 @@ def __init__(
384450
self._brk_base: dict[int, int] = {} # pid → last known brk
385451
self._proc_count: int = 1 # Start at 1 (the initial child)
386452
self._proc_pids: set[int] = {child_pid} # All known sandbox PIDs
453+
# getdents /proc filtering: fd → list of remaining dirent entries
454+
self._proc_dir_cache: dict[int, list[bytes]] = {}
387455

388456
def start(self) -> None:
389457
"""Start the supervisor thread."""
@@ -499,6 +567,14 @@ def _dispatch(self, notif: SeccompNotif) -> None:
499567
self._handle_net(notif, nr)
500568
return
501569

570+
# --- /proc readdir PID filtering ---
571+
nr_getdents64 = _SYSCALL_NR.get("getdents64")
572+
nr_getdents = _SYSCALL_NR.get("getdents")
573+
574+
if nr in (nr_getdents64, nr_getdents) and self._policy.isolate_pids:
575+
self._handle_getdents(notif)
576+
return
577+
502578
# --- Filesystem: open / openat virtualization ---
503579
nr_openat = _SYSCALL_NR.get("openat")
504580
nr_open = _SYSCALL_NR.get("open")
@@ -658,8 +734,77 @@ def _handle_fork(self, notif: SeccompNotif, nr: int) -> None:
658734
# The new child's PID is unknown until it makes its first
659735
# intercepted syscall — tracked lazily via _record_pid.
660736
self._proc_pids.add(notif.pid)
737+
# Invalidate /proc readdir cache so new PIDs appear
738+
self._proc_dir_cache.clear()
661739
self._respond_continue(notif.id)
662740

741+
def _handle_getdents(self, notif: SeccompNotif) -> None:
    """Answer getdents64/getdents on /proc with a PID-filtered listing.

    Calls on any fd that does not resolve to ``/proc`` are passed through
    to the kernel unchanged. For /proc, the first call for a given
    ``(pid, fd)`` builds the filtered dirent64 records and caches them;
    each call then copies as many pending records as fit into the child's
    buffer and returns the byte count, and returns 0 once the cache is
    exhausted.

    If the buffer cannot hold even the first pending record the call
    fails with ``EINVAL`` (as the real syscall does) rather than
    returning 0, which the caller would misread as end-of-directory.

    Args:
        notif: The seccomp notification for the intercepted syscall;
            ``data.args`` holds (fd, buffer address, buffer size).

    NOTE(review): cache entries are only dropped when a listing is read
    to the end, a write fails, or the cache is cleared elsewhere; an fd
    closed mid-listing leaves its entry behind for a later fd with the
    same number — confirm whether that needs handling.
    """
    import errno  # local import: only needed for the EINVAL reply

    pid = notif.pid
    child_fd_num = notif.data.args[0] & 0xFFFFFFFF
    buf_addr = notif.data.args[1]
    buf_size = notif.data.args[2] & 0xFFFFFFFF

    # Only directory reads on /proc itself are virtualized.
    try:
        target = os.readlink(f"/proc/{pid}/fd/{child_fd_num}")
    except OSError:
        self._respond_continue(notif.id)
        return

    if target != "/proc":
        self._respond_continue(notif.id)
        return

    # Build the cache on the first call for this (pid, fd).
    cache_key = (pid, child_fd_num)
    if cache_key not in self._proc_dir_cache:
        if self._pids_fn is None:
            # Without a PID source there is nothing to filter against.
            self._respond_continue(notif.id)
            return
        sandbox_pids = set(self._pids_fn())
        self._proc_dir_cache[cache_key] = _build_filtered_dirents(sandbox_pids)

    entries = self._proc_dir_cache[cache_key]

    if not self._id_valid(notif.id):
        return

    # Pack as many whole records as fit into buf_size.
    result = bytearray()
    consumed = 0
    for entry in entries:
        if len(result) + len(entry) > buf_size:
            break
        result.extend(entry)
        consumed += 1

    if entries and consumed == 0:
        # Records are pending but not even one fits. Keep the cache so a
        # retry with a larger buffer continues where it left off.
        self._respond_errno(notif.id, errno.EINVAL)
        return

    if consumed > 0:
        self._proc_dir_cache[cache_key] = entries[consumed:]
    else:
        # Nothing pending: end of directory, drop the cache entry.
        del self._proc_dir_cache[cache_key]

    # Write to child memory and return the byte count.
    try:
        if result:
            write_bytes(pid, buf_addr, bytes(result))
        self._respond_val(notif.id, len(result))
    except OSError:
        # Could not fill the child's buffer: forget the partial state and
        # let the kernel run the real syscall instead.
        self._proc_dir_cache.pop(cache_key, None)
        self._respond_continue(notif.id)
807+
663808
def _id_valid(self, notif_id: int) -> bool:
664809
"""Check if a notification ID is still valid (TOCTTOU check)."""
665810
id_val = ctypes.c_uint64(notif_id)
@@ -683,6 +828,19 @@ def _respond_continue(self, notif_id: int) -> None:
683828
ctypes.byref(resp),
684829
)
685830

831+
def _respond_val(self, notif_id: int, val: int) -> None:
    """Reply to a notification so the syscall returns ``val``.

    Args:
        notif_id: ID of the seccomp notification being answered.
        val: Value to place in the response's ``val`` field.
    """
    reply = SeccompNotifResp()
    reply.id, reply.val = notif_id, val
    # Error and flags are both zeroed for a plain value reply.
    reply.error = reply.flags = 0

    fd_arg = ctypes.c_int(self._notify_fd)
    request = ctypes.c_ulong(SECCOMP_IOCTL_NOTIF_SEND)
    _libc.ioctl(fd_arg, request, ctypes.byref(reply))
843+
686844
def _respond_errno(self, notif_id: int, errno_code: int) -> None:
687845
"""Deny the syscall with the given errno."""
688846
resp = SeccompNotifResp()

src/sandlock/_procfs.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,24 @@ def read_bytes(pid: int, addr: int, length: int) -> bytes:
5656
os.close(fd)
5757

5858

59+
def write_bytes(pid: int, addr: int, data: bytes) -> None:
    """Write raw bytes to a child process's memory.

    The write goes through ``/proc/<pid>/mem`` and is repeated until every
    byte of ``data`` has been written, so a short write is never silently
    accepted.

    Args:
        pid: Target process ID.
        addr: Virtual address in the target's address space.
        data: Bytes to write.

    Raises:
        OSError: If /proc/<pid>/mem cannot be opened or written, or if a
            write makes no progress before all of ``data`` is stored.
    """
    import errno  # local import: only needed for the no-progress error

    fd = os.open(f"/proc/{pid}/mem", os.O_WRONLY)
    try:
        view = memoryview(data)  # slice without copying on retries
        total = len(view)
        written = 0
        while written < total:
            # os.pwrite returns the number of bytes actually written,
            # which may be fewer than requested.
            count = os.pwrite(fd, view[written:], addr + written)
            if count <= 0:
                raise OSError(
                    errno.EIO,
                    f"short write to /proc/{pid}/mem "
                    f"({written} of {total} bytes)",
                )
            written += count
    finally:
        os.close(fd)
75+
76+
5977
def resolve_openat_path(pid: int, dirfd: int, pathname_addr: int) -> str:
6078
"""Resolve the full path for an openat(dirfd, pathname, ...) call.
6179

src/sandlock/cli.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -76,21 +76,6 @@ def cmd_run(args: argparse.Namespace) -> int:
7676
else:
7777
policy = Policy(**cli_kwargs)
7878

79-
# Auto-enable /proc pid isolation when /proc is readable
80-
readable = list(policy.fs_readable)
81-
if readable and any(
82-
p == "/proc" or p.rstrip("/") == "/proc" for p in readable
83-
):
84-
from ._notif_policy import NotifPolicy, default_proc_rules
85-
if policy.notif_policy is None:
86-
import dataclasses
87-
policy = dataclasses.replace(
88-
policy,
89-
notif_policy=NotifPolicy(
90-
rules=default_proc_rules(),
91-
isolate_pids=True,
92-
),
93-
)
9479
sb = Sandbox(policy)
9580

9681
if args.interactive:

tests/test_integration.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -694,6 +694,63 @@ def test_proc_auto_isolation_blocks_foreign_pid(self):
694694
assert result.success
695695
assert b"HIDDEN" in result.stdout
696696

697+
def test_getdents_hides_foreign_pids(self):
    """readdir(/proc) should only show sandbox PIDs when isolate_pids=True.

    The sandboxed script lists /proc and reports whether its own PID and
    PID 1 appear, plus the number of numeric entries. The test requires
    the own PID to be listed, PID 1 to be hidden, and the count to be
    small.
    """
    from sandlock._notif_policy import NotifPolicy, default_proc_rules

    policy = Policy(
        fs_readable=_PYTHON_READABLE,
        notif_policy=NotifPolicy(
            rules=default_proc_rules(),
            isolate_pids=True,
        ),
    )
    result = Sandbox(policy).run(
        ["python3", "-c", """
import os
# List /proc and collect numeric (PID) entries
pids = [e for e in os.listdir('/proc') if e.isdigit()]
my_pid = str(os.getpid())
# Our own PID should be visible
if my_pid in pids:
    print('SELF_VISIBLE')
else:
    print('SELF_HIDDEN')
# PID 1 (init) should NOT be visible
if '1' in pids:
    print('INIT_VISIBLE')
else:
    print('INIT_HIDDEN')
print(f'PID_COUNT={len(pids)}')
"""]
    )
    assert result.success
    # The script reports self-visibility; check it so that a filter hiding
    # every PID cannot pass.
    # NOTE(review): assumes os.getpid() in the child matches the PID shown
    # in /proc (no PID namespace) — confirm.
    assert b"SELF_VISIBLE" in result.stdout
    assert b"INIT_HIDDEN" in result.stdout
    # Should have very few PIDs (just the sandbox's own)
    counts = [
        int(line.split("=")[1])
        for line in result.stdout.decode().splitlines()
        if line.startswith("PID_COUNT=")
    ]
    assert counts, "PID_COUNT line missing from sandbox output"
    for count in counts:
        assert count < 10, f"Too many PIDs visible: {count}"
734+
735+
def test_always_isolates_when_proc_readable(self):
    """With /proc in fs_readable and no notif_policy, PID 1 stays hidden."""
    script = """
import os
pids = [e for e in os.listdir('/proc') if e.isdigit()]
if '1' in pids:
    print('INIT_VISIBLE')
else:
    print('INIT_HIDDEN')
print(f'PID_COUNT={len(pids)}')
"""
    # No explicit notif_policy: isolation has to switch on by itself.
    sandbox = Sandbox(Policy(fs_readable=_PYTHON_READABLE))
    outcome = sandbox.run(["python3", "-c", script])

    assert outcome.success
    assert b"INIT_HIDDEN" in outcome.stdout
753+
697754

698755
# --- Resource limits (seccomp notif based) ---
699756

0 commit comments

Comments
 (0)