Skip to content

Commit bf8690b

Browse files
committed
Harden /proc/self oom and fdinfo nodes
procfs emulation now treats the OOM trio (oom_score_adj, legacy oom_adj, read-only oom_score) as one process-wide adjustment with per-path read and write semantics: legacy oom_adj scales to oom_score_adj on writes (special-casing OOM_DISABLE -> SCORE_ADJ_MIN and OOM_ADJUST_MAX -> SCORE_ADJ_MAX so the boundary intent survives the lossy multiply) and back-clamps to [-17, 15] on reads; oom_score is read-only with a stub zero. The OOM write path serializes the truncate+pwrite+lseek under a new oom_write_lock and publishes the global atomic only after the backing rewrite succeeds, so a partial-rewrite failure no longer leaves the process-wide value diverged from a returned -1. Zero-length writes short-circuit to success (matches Linux for proc nodes; sys_writev previously hit -EINVAL in the parser). Stat reports st_size 0 for every synthetic /proc file so callers that pre-size buffers from stat cannot truncate (a 256-byte cap had silently chopped /proc/cpuinfo on hosts with many CPUs; a 2-byte cap had reduced -1000 to -1 on oom_score_adj). A new read-intercept path mirrors the write side. proc_intercept_read and proc_intercept_readv let read/pread/readv/preadv on the OOM nodes return the live atomic value rather than the per-open temp file content, and sendfile/copy_file_range route through the same hook so proc-source byte counts stay consistent with the value an immediately following open would observe. /proc/self/fdinfo gains type-specific lines for the special fd classes elfuse implements: eventfd-count (16-char hex matching fs/eventfd.c), sigmask (16-char hex), and timerfd clockid/ticks/it_value/it_interval. The accessors live in src/syscall/fd.c (eventfd_fdinfo_snapshot, signalfd_fdinfo_snapshot, timerfd_fdinfo_snapshot) and read state under sfd_lock to prevent tearing across concurrent read/write/settime. 
The per-fd lseek probe now uses fd_to_host_dup so a concurrent close+reopen on another vCPU cannot redirect the probe to an unrelated host fd, and errno is saved/restored across the ESPIPE-prone lseek so non-seekable fds (sockets, pipes) do not pollute the caller's state. /proc/self/fdinfo and /proc/self/fd no longer share one static backing directory across opens. The previous design let a second open unlink and recreate entries while a sibling thread iterated its dirfd; both nodes now go through proc_open_fd_scratch, which mkdtemps a private directory per open, populates it from a fresh fd-table snapshot, and tracks the path in proc_scratch_dirs[] for atexit cleanup so the previously-leaked backing dirs are reaped at process exit. The unix-net visitor's buffer-tail margin grew from 128 to 256 bytes to fit the longest possible row (54 fixed + 108 sun_path + newline); the previous margin let the snprintf truncate the path and drop the trailing newline. Eight explicit /proc/<pid>/X cases collapsed into one general alias-and-recurse, so /proc/<our_pid>/maps, /oom_score_adj, /limits, etc. now route through the matching /proc/self handler. Locked in by tests/test-tier-b.c (35 cases including oom write persistence, out-of-range -EINVAL, oom_adj=15 -> 1000 scaling, oom_score read-only and write-rejected, zero-length writev, stat-size-zero, fdinfo eventfd-count hex, fdinfo sigmask, fdinfo timerfd next expiry for periodic timers, concurrent fdinfo enumeration, and a /proc/net/tcp sl-density regression that opens non-TCP sockets before TCP listeners so the iterator visits rejected sockets first; the post-fix dense sl=0,1,... output matches qemu Linux ground truth, and a manual bug reintroduction confirms the test catches the sparse-slot regression with sl=4 expected=0). tests/test-io-opt.c adds sendfile and copy_file_range coverage for the read-intercept path.
1 parent 34c741f commit bf8690b

8 files changed

Lines changed: 1887 additions & 615 deletions

File tree

src/runtime/procemu.c

Lines changed: 990 additions & 590 deletions
Large diffs are not rendered by default.

src/runtime/procemu.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include <stddef.h>
1414
#include <sys/stat.h>
15+
#include <sys/uio.h>
1516
#include "core/guest.h"
1617

1718
/* Sentinel return value: path was not intercepted, caller should fall through
@@ -53,6 +54,24 @@ int proc_intercept_write(int guest_fd,
5354
int use_pwrite,
5455
ssize_t *written_out);
5556

57+
/* Intercept reads from synthetic proc files that must reflect shared state on
58+
* every read rather than the per-open temp-file snapshot.
59+
* Returns 1 if handled (with *read_out set), 0 if not intercepted, or -1 on
60+
* error with errno set.
61+
*/
62+
int proc_intercept_read(int guest_fd,
63+
void *buf,
64+
size_t count,
65+
int64_t offset,
66+
ssize_t *read_out);
67+
68+
/* Vector form of proc_intercept_read for readv/preadv. */
69+
int proc_intercept_readv(int guest_fd,
70+
const struct iovec *iov,
71+
int iovcnt,
72+
int64_t offset,
73+
ssize_t *read_out);
74+
5675
/* Get the /dev/shm emulation directory path (creating it on first call).
5776
* Used by sys_unlinkat to rewrite /dev/shm/<name> paths.
5877
*/

src/syscall/fd.c

Lines changed: 88 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,33 @@ static int timerfd_alloc(void)
116116
return sfd_alloc_slot(timerfd_state, TIMERFD_MAX, sizeof(timerfd_state[0]));
117117
}
118118

119+
/* Called with sfd_lock held. Returns nanoseconds until the next expiration,
120+
* or 0 when the timer is disarmed or a one-shot timer has already expired.
121+
*/
122+
static int64_t timerfd_remaining_ns_locked(int slot, int64_t now_ns)
123+
{
124+
if (!timerfd_state[slot].armed)
125+
return 0;
126+
127+
int64_t elapsed = now_ns - timerfd_state[slot].arm_time_ns;
128+
if (elapsed < 0)
129+
elapsed = 0;
130+
131+
if (timerfd_state[slot].interval_ns > 0) {
132+
int64_t total = timerfd_state[slot].initial_ns;
133+
if (elapsed >= total) {
134+
int64_t since_first = elapsed - total;
135+
int64_t interval = timerfd_state[slot].interval_ns;
136+
int64_t remaining = interval - (since_first % interval);
137+
return remaining == 0 ? interval : remaining;
138+
}
139+
return total - elapsed;
140+
}
141+
142+
int64_t remaining = timerfd_state[slot].initial_ns - elapsed;
143+
return remaining > 0 ? remaining : 0;
144+
}
145+
119146
int64_t sys_timerfd_create(int clockid, int flags)
120147
{
121148
if (clockid != LINUX_CLOCK_REALTIME && clockid != LINUX_CLOCK_MONOTONIC)
@@ -203,8 +230,7 @@ int64_t sys_timerfd_settime(guest_t *g,
203230
struct timespec now;
204231
clock_gettime(CLOCK_MONOTONIC, &now);
205232
int64_t now_ns = now.tv_sec * NS_PER_SEC + now.tv_nsec;
206-
int64_t elapsed = now_ns - timerfd_state[slot].arm_time_ns;
207-
int64_t remaining = timerfd_state[slot].initial_ns - elapsed;
233+
int64_t remaining = timerfd_remaining_ns_locked(slot, now_ns);
208234
if (remaining > 0) {
209235
old.it_value_sec = remaining / NS_PER_SEC;
210236
old.it_value_nsec = remaining % NS_PER_SEC;
@@ -319,27 +345,10 @@ int64_t sys_timerfd_gettime(guest_t *g, int fd, uint64_t curr_value_gva)
319345
its.it_interval_sec = timerfd_state[slot].interval_ns / NS_PER_SEC;
320346
its.it_interval_nsec = timerfd_state[slot].interval_ns % NS_PER_SEC;
321347

322-
/* Compute actual remaining time from arm time + initial value */
323348
struct timespec now;
324349
clock_gettime(CLOCK_MONOTONIC, &now);
325350
int64_t now_ns = now.tv_sec * NS_PER_SEC + now.tv_nsec;
326-
int64_t elapsed = now_ns - timerfd_state[slot].arm_time_ns;
327-
int64_t remaining;
328-
329-
if (timerfd_state[slot].interval_ns > 0) {
330-
/* Repeating timer: remaining = interval - (elapsed % interval) */
331-
int64_t total = timerfd_state[slot].initial_ns;
332-
if (elapsed >= total) {
333-
int64_t since_first = elapsed - total;
334-
remaining = timerfd_state[slot].interval_ns -
335-
(since_first % timerfd_state[slot].interval_ns);
336-
} else {
337-
remaining = total - elapsed;
338-
}
339-
} else {
340-
/* One-shot: remaining = initial - elapsed */
341-
remaining = timerfd_state[slot].initial_ns - elapsed;
342-
}
351+
int64_t remaining = timerfd_remaining_ns_locked(slot, now_ns);
343352

344353
if (remaining <= 0) {
345354
/* Timer already expired (one-shot) */
@@ -1073,3 +1082,62 @@ void signalfd_notify(int signum)
10731082
}
10741083
pthread_mutex_unlock(&sfd_lock);
10751084
}
1085+
1086+
/* /proc/self/fdinfo type-specific snapshots. Each takes sfd_lock to prevent
1087+
* tearing across concurrent read/write/settime; lock order is fd_lock(3)
1088+
* -> sfd_lock(5a), and these accessors take only sfd_lock so the procemu
1089+
* caller is free to drop fd_lock between fd_snapshot and the lookup here.
1090+
*/
1091+
1092+
bool eventfd_fdinfo_snapshot(int guest_fd, uint64_t *count_out)
1093+
{
1094+
pthread_mutex_lock(&sfd_lock);
1095+
int slot = eventfd_find(guest_fd);
1096+
if (slot < 0) {
1097+
pthread_mutex_unlock(&sfd_lock);
1098+
return false;
1099+
}
1100+
*count_out = eventfd_state[slot].counter;
1101+
pthread_mutex_unlock(&sfd_lock);
1102+
return true;
1103+
}
1104+
1105+
bool signalfd_fdinfo_snapshot(int guest_fd, uint64_t *mask_out)
1106+
{
1107+
pthread_mutex_lock(&sfd_lock);
1108+
int slot = signalfd_find(guest_fd);
1109+
if (slot < 0) {
1110+
pthread_mutex_unlock(&sfd_lock);
1111+
return false;
1112+
}
1113+
*mask_out = signalfd_state[slot].mask;
1114+
pthread_mutex_unlock(&sfd_lock);
1115+
return true;
1116+
}
1117+
1118+
bool timerfd_fdinfo_snapshot(int guest_fd,
1119+
int *clockid_out,
1120+
uint64_t *ticks_out,
1121+
int64_t *value_ns_out,
1122+
int64_t *interval_ns_out)
1123+
{
1124+
pthread_mutex_lock(&sfd_lock);
1125+
int slot = timerfd_find(guest_fd);
1126+
if (slot < 0) {
1127+
pthread_mutex_unlock(&sfd_lock);
1128+
return false;
1129+
}
1130+
*clockid_out = timerfd_state[slot].clockid;
1131+
*ticks_out = timerfd_state[slot].expirations;
1132+
*interval_ns_out = timerfd_state[slot].interval_ns;
1133+
int64_t value_ns = 0;
1134+
if (timerfd_state[slot].armed) {
1135+
struct timespec now;
1136+
clock_gettime(CLOCK_MONOTONIC, &now);
1137+
int64_t now_ns = (int64_t) now.tv_sec * NS_PER_SEC + now.tv_nsec;
1138+
value_ns = timerfd_remaining_ns_locked(slot, now_ns);
1139+
}
1140+
*value_ns_out = value_ns;
1141+
pthread_mutex_unlock(&sfd_lock);
1142+
return true;
1143+
}

src/syscall/fd.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,3 +66,15 @@ int64_t timerfd_read(int guest_fd,
6666
* writes a byte to make poll/epoll see readability.
6767
*/
6868
void signalfd_notify(int signum);
69+
70+
/* Snapshot per-fd state for /proc/self/fdinfo. Each accessor returns true when
71+
* the guest_fd refers to a live instance of that special-fd type. The values
72+
* are read under sfd_lock so concurrent read/write/settime cannot tear them.
73+
*/
74+
bool eventfd_fdinfo_snapshot(int guest_fd, uint64_t *count_out);
75+
bool signalfd_fdinfo_snapshot(int guest_fd, uint64_t *mask_out);
76+
bool timerfd_fdinfo_snapshot(int guest_fd,
77+
int *clockid_out,
78+
uint64_t *ticks_out,
79+
int64_t *value_ns_out,
80+
int64_t *interval_ns_out);

src/syscall/fs.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ static const char *proc_stateful_file_path(const char *path)
7070
return NULL;
7171

7272
if (!strcmp(path, "/proc/self/oom_score_adj") ||
73-
!strcmp(path, "/proc/self/oom_adj")) {
73+
!strcmp(path, "/proc/self/oom_adj") ||
74+
!strcmp(path, "/proc/self/oom_score")) {
7475
return path;
7576
}
7677

@@ -86,6 +87,8 @@ static const char *proc_stateful_file_path(const char *path)
8687
return "/proc/self/oom_score_adj";
8788
if (!strcmp(endp, "/oom_adj"))
8889
return "/proc/self/oom_adj";
90+
if (!strcmp(endp, "/oom_score"))
91+
return "/proc/self/oom_score";
8992

9093
return NULL;
9194
}

src/syscall/io.c

Lines changed: 124 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,68 @@ static int64_t host_fd_ref_open_regular_io(int guest_fd, host_fd_ref_t *ref)
479479
return host_fd_ref_open_io(guest_fd, ref);
480480
}
481481

/* Give proc_intercept_read() a chance to serve a read on guest fd `fd`.
 *
 * Returns INT64_MIN when the read was not intercepted (the caller then falls
 * back to the host read), the value of linux_errno() when the intercept or
 * the follow-up seek fails, and otherwise the number of bytes produced.
 *
 * For positionless reads (use_pread == 0) the host fd's file position is
 * moved to offset + bytes produced after a successful intercept.
 */
static int64_t proc_try_read_intercept(int fd,
                                       int host_fd,
                                       void *buf,
                                       size_t count,
                                       int64_t offset,
                                       int use_pread)
{
    ssize_t nread = 0;
    int rc = proc_intercept_read(fd, buf, count, offset, &nread);

    if (rc < 0)
        return linux_errno();
    if (rc == 0)
        return INT64_MIN;

    if (!use_pread) {
        int64_t new_pos = offset + (int64_t) nread;
        if (lseek(host_fd, new_pos, SEEK_SET) < 0)
            return linux_errno();
    }

    return nread;
}
501+
502+
static int64_t proc_try_readv_intercept(int fd,
503+
int host_fd,
504+
const struct iovec *iov,
505+
int iovcnt,
506+
int64_t offset,
507+
int use_pread)
508+
{
509+
ssize_t intercepted = 0;
510+
int handled = proc_intercept_readv(fd, iov, iovcnt, offset, &intercepted);
511+
if (handled < 0)
512+
return linux_errno();
513+
if (handled > 0) {
514+
if (!use_pread &&
515+
lseek(host_fd, offset + (int64_t) intercepted, SEEK_SET) < 0)
516+
return linux_errno();
517+
return intercepted;
518+
}
519+
return INT64_MIN;
520+
}
521+
522+
static int64_t proc_try_copy_read_intercept(int fd,
523+
int host_fd,
524+
void *buf,
525+
size_t count,
526+
int64_t *offset_io,
527+
int use_pread)
528+
{
529+
int64_t offset = *offset_io;
530+
531+
if (!use_pread) {
532+
offset = lseek(host_fd, 0, SEEK_CUR);
533+
if (offset < 0)
534+
return INT64_MIN;
535+
}
536+
537+
int64_t intercepted =
538+
proc_try_read_intercept(fd, host_fd, buf, count, offset, use_pread);
539+
if (intercepted == INT64_MIN)
540+
return INT64_MIN;
541+
return intercepted;
542+
}
543+
482544
static int64_t proc_try_writev_intercept(int fd,
483545
int host_fd,
484546
const struct iovec *iov,
@@ -613,6 +675,16 @@ int64_t sys_read(guest_t *g, int fd, uint64_t buf_gva, uint64_t count)
613675
if (count > avail)
614676
count = avail;
615677

678+
off_t offset = lseek(host_ref.fd, 0, SEEK_CUR);
679+
if (offset >= 0) {
680+
int64_t intercepted =
681+
proc_try_read_intercept(fd, host_ref.fd, buf, count, offset, 0);
682+
if (intercepted != INT64_MIN) {
683+
host_fd_ref_close(&host_ref);
684+
return intercepted;
685+
}
686+
}
687+
616688
ssize_t ret = read(host_ref.fd, buf, count);
617689
host_fd_ref_close(&host_ref);
618690
return ret < 0 ? linux_errno() : ret;
@@ -642,6 +714,13 @@ int64_t sys_pread64(guest_t *g,
642714
if (count > avail)
643715
count = avail;
644716

717+
int64_t intercepted =
718+
proc_try_read_intercept(fd, host_ref.fd, buf, count, offset, 1);
719+
if (intercepted != INT64_MIN) {
720+
host_fd_ref_close(&host_ref);
721+
return intercepted;
722+
}
723+
645724
ssize_t ret = pread(host_ref.fd, buf, count, offset);
646725
host_fd_ref_close(&host_ref);
647726
return ret < 0 ? linux_errno() : ret;
@@ -832,6 +911,17 @@ int64_t sys_readv(guest_t *g, int fd, uint64_t iov_gva, int iovcnt)
832911
return err;
833912
}
834913

914+
off_t offset = lseek(host_ref.fd, 0, SEEK_CUR);
915+
if (offset >= 0) {
916+
int64_t intercepted = proc_try_readv_intercept(
917+
fd, host_ref.fd, host_iov.iov, iovcnt, offset, 0);
918+
if (intercepted != INT64_MIN) {
919+
host_iov_free(&host_iov);
920+
host_fd_ref_close(&host_ref);
921+
return intercepted;
922+
}
923+
}
924+
835925
ssize_t ret = readv(host_ref.fd, host_iov.iov, iovcnt);
836926
int64_t result = ret < 0 ? linux_errno() : ret;
837927
host_iov_free(&host_iov);
@@ -919,6 +1009,14 @@ int64_t sys_preadv(guest_t *g,
9191009
return err;
9201010
}
9211011

1012+
int64_t intercepted = proc_try_readv_intercept(
1013+
fd, host_ref.fd, host_iov.iov, iovcnt, offset, 1);
1014+
if (intercepted != INT64_MIN) {
1015+
host_iov_free(&host_iov);
1016+
host_fd_ref_close(&host_ref);
1017+
return intercepted;
1018+
}
1019+
9221020
ssize_t ret = preadv(host_ref.fd, host_iov.iov, iovcnt, offset);
9231021
int64_t result = ret < 0 ? linux_errno() : ret;
9241022
host_iov_free(&host_iov);
@@ -1354,9 +1452,20 @@ int64_t sys_sendfile(guest_t *g,
13541452
size_t chunk = remaining > sizeof(buf) ? sizeof(buf) : remaining;
13551453
ssize_t nr;
13561454
if (offset >= 0) {
1357-
nr = pread(in_ref.fd, buf, chunk, offset);
1455+
int64_t intercepted = proc_try_copy_read_intercept(
1456+
in_fd, in_ref.fd, buf, chunk, &offset, 1);
1457+
if (intercepted != INT64_MIN)
1458+
nr = intercepted;
1459+
else
1460+
nr = pread(in_ref.fd, buf, chunk, offset);
13581461
} else {
1359-
nr = read(in_ref.fd, buf, chunk);
1462+
int64_t current = 0;
1463+
int64_t intercepted = proc_try_copy_read_intercept(
1464+
in_fd, in_ref.fd, buf, chunk, &current, 0);
1465+
if (intercepted != INT64_MIN)
1466+
nr = intercepted;
1467+
else
1468+
nr = read(in_ref.fd, buf, chunk);
13601469
}
13611470
if (nr < 0) {
13621471
if (total > 0)
@@ -1443,9 +1552,20 @@ int64_t sys_copy_file_range(guest_t *g,
14431552
size_t chunk = remaining > sizeof(buf) ? sizeof(buf) : remaining;
14441553
ssize_t nr;
14451554
if (off_in >= 0) {
1446-
nr = pread(in_ref.fd, buf, chunk, off_in);
1555+
int64_t intercepted = proc_try_copy_read_intercept(
1556+
fd_in, in_ref.fd, buf, chunk, &off_in, 1);
1557+
if (intercepted != INT64_MIN)
1558+
nr = intercepted;
1559+
else
1560+
nr = pread(in_ref.fd, buf, chunk, off_in);
14471561
} else {
1448-
nr = read(in_ref.fd, buf, chunk);
1562+
int64_t current = 0;
1563+
int64_t intercepted = proc_try_copy_read_intercept(
1564+
fd_in, in_ref.fd, buf, chunk, &current, 0);
1565+
if (intercepted != INT64_MIN)
1566+
nr = intercepted;
1567+
else
1568+
nr = read(in_ref.fd, buf, chunk);
14491569
}
14501570
if (nr < 0) {
14511571
if (total > 0)

0 commit comments

Comments
 (0)