Skip to content

Commit fc95a57

Browse files
committed
feat: support bottlerocket
1 parent 710e9a7 commit fc95a57

File tree

13 files changed

+644
-25
lines changed

13 files changed

+644
-25
lines changed

compel/arch/arm/plugins/std/syscalls/syscall.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *
9292
signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
9393
rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
9494
vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
95+
pipe2 59 359 (int *fildes, int flags)
9596
timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
9697
fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags)
9798
fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname)

compel/arch/x86/plugins/std/syscalls/syscall_64.tbl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ __NR_vmsplice 278 sys_vmsplice (int fd, const struct iovec *iov, unsigned lo
9797
__NR_fallocate 285 sys_fallocate (int fd, int mode, loff_t offset, loff_t len)
9898
__NR_timerfd_settime 286 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
9999
__NR_signalfd4 289 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
100+
__NR_pipe2 293 sys_pipe2 (int *fildes, int flags)
100101
__NR_preadv 295 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
101102
__NR_rt_tgsigqueueinfo 297 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
102103
__NR_fanotify_init 300 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags)
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <fcntl.h>
4+
#include <unistd.h>
5+
#include <string.h>
6+
#include <errno.h>
7+
8+
/*
 * Clear the soft-dirty bits of a process by writing "4" to
 * /proc/<pid>/clear_refs.
 *
 * Usage: <prog> <pid>
 *
 * The PID argument must consist of decimal digits only; anything else is
 * rejected so that the argument cannot redirect the write to another path
 * (e.g. "1/../../some/dir").
 *
 * Returns 0 on success and 1 on a usage error, an invalid PID, or when the
 * clear_refs file cannot be opened or completely written.
 */
int main(int argc, char *argv[]) {
    static const char val[] = "4\n";
    const size_t val_len = sizeof(val) - 1;
    char path[64];
    const char *p;
    int pid_ok;
    int n;
    int fd;
    ssize_t written;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s <pid>\n", argv[0]);
        fprintf(stderr, "Writes '4' to /proc/<pid>/clear_refs to clear soft-dirty bits\n");
        return 1;
    }

    /* Accept only a non-empty string of ASCII digits. */
    p = argv[1];
    pid_ok = (*p != '\0');
    for (; pid_ok && *p != '\0'; p++) {
        if (*p < '0' || *p > '9')
            pid_ok = 0;
    }
    if (!pid_ok) {
        fprintf(stderr, "Invalid PID '%s': must be a decimal number\n", argv[1]);
        return 1;
    }

    /* Refuse to continue with a truncated path: it would name a different file. */
    n = snprintf(path, sizeof(path), "/proc/%s/clear_refs", argv[1]);
    if (n < 0 || (size_t)n >= sizeof(path)) {
        fprintf(stderr, "PID '%s' is too long\n", argv[1]);
        return 1;
    }

    fd = open(path, O_WRONLY);
    if (fd < 0) {
        fprintf(stderr, "Failed to open %s: %s\n", path, strerror(errno));
        return 1;
    }

    written = write(fd, val, val_len);
    if (written < 0) {
        fprintf(stderr, "Failed to write to %s: %s\n", path, strerror(errno));
        close(fd);
        return 1;
    }
    /* A partial write means the request was not delivered; do not report success. */
    if ((size_t)written != val_len) {
        fprintf(stderr, "Short write to %s: wrote %zd of %zu bytes\n", path, written, val_len);
        close(fd);
        return 1;
    }

    close(fd);
    return 0;
}
35+
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
// clear-refs-exec.go
2+
//
3+
// A utility to clear soft-dirty bits for a containerized process by executing
4+
// clear_refs inside the container via crictl exec.
5+
//
6+
// Build: CGO_ENABLED=0 go build -ldflags="-s -w" -o clear-refs-exec clear-refs-exec.go
7+
8+
package main
9+
10+
import (
11+
"bufio"
12+
"flag"
13+
"fmt"
14+
"os"
15+
"os/exec"
16+
"path/filepath"
17+
"strconv"
18+
"strings"
19+
)
20+
21+
func main() {
22+
// Parse command-line flags
23+
clearRefsBin := flag.String("clear-refs-bin", "/bin/clear_refs", "path to clear_refs binary inside container")
24+
flag.Usage = func() {
25+
fmt.Fprintf(os.Stderr, "Usage: %s [options] <root-ns-pid>\n\n", os.Args[0])
26+
fmt.Fprintf(os.Stderr, "Clears soft-dirty bits for a container process via crictl exec.\n\n")
27+
fmt.Fprintf(os.Stderr, "Options:\n")
28+
flag.PrintDefaults()
29+
}
30+
flag.Parse()
31+
32+
if flag.NArg() != 1 {
33+
flag.Usage()
34+
os.Exit(1)
35+
}
36+
37+
pid := flag.Arg(0)
38+
39+
// Validate PID is a number
40+
if _, err := strconv.Atoi(pid); err != nil {
41+
fmt.Fprintf(os.Stderr, "error: invalid PID %q: %v\n", pid, err)
42+
os.Exit(1)
43+
}
44+
45+
// Extract container ID from cgroup
46+
containerID, err := getContainerID(pid)
47+
if err != nil {
48+
fmt.Fprintf(os.Stderr, "error: failed to get container ID: %v\n", err)
49+
os.Exit(1)
50+
}
51+
52+
// Get the NSpid (PID in container namespace)
53+
nspid, err := getNsPid(pid)
54+
if err != nil {
55+
fmt.Fprintf(os.Stderr, "error: failed to get NSpid: %v\n", err)
56+
os.Exit(1)
57+
}
58+
59+
fmt.Printf("Container ID: %s\n", containerID)
60+
fmt.Printf("NSpid: %s\n", nspid)
61+
62+
// Execute clear_refs inside the container via crictl exec
63+
if err := execClearRefs(containerID, *clearRefsBin, nspid); err != nil {
64+
fmt.Fprintf(os.Stderr, "error: failed to execute clear_refs: %v\n", err)
65+
os.Exit(1)
66+
}
67+
68+
fmt.Println("Successfully cleared soft-dirty bits")
69+
}
70+
71+
// getContainerID reads /proc/<pid>/cgroup and extracts the container ID.
72+
// Handles both standard kubepods format and cri-containerd scope format.
73+
func getContainerID(pid string) (string, error) {
74+
cgroupPath := filepath.Join("/proc", pid, "cgroup")
75+
file, err := os.Open(cgroupPath)
76+
if err != nil {
77+
return "", fmt.Errorf("failed to open %s: %w", cgroupPath, err)
78+
}
79+
defer file.Close()
80+
81+
scanner := bufio.NewScanner(file)
82+
for scanner.Scan() {
83+
line := scanner.Text()
84+
// cgroup line format: hierarchy-ID:controller-list:cgroup-path
85+
// Example: 0::/kubepods/besteffort/pod<uuid>/<container_id>
86+
parts := strings.SplitN(line, ":", 3)
87+
if len(parts) < 3 {
88+
continue
89+
}
90+
91+
cgroupPathPart := parts[2]
92+
if cgroupPathPart == "" || cgroupPathPart == "/" {
93+
continue
94+
}
95+
96+
// Try to extract container ID from the cgroup path
97+
containerID := extractContainerID(cgroupPathPart)
98+
if containerID != "" {
99+
return containerID, nil
100+
}
101+
}
102+
103+
if err := scanner.Err(); err != nil {
104+
return "", fmt.Errorf("failed to read %s: %w", cgroupPath, err)
105+
}
106+
107+
return "", fmt.Errorf("could not find container ID in %s", cgroupPath)
108+
}
109+
110+
// extractContainerID extracts the container ID from a cgroup path.
111+
// Handles formats like:
112+
// - /kubepods/besteffort/pod<uuid>/<container_id>
113+
// - /kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod<uuid>.slice/cri-containerd-<container_id>.scope
114+
func extractContainerID(cgroupPath string) string {
115+
// Get the last component of the path
116+
lastComponent := filepath.Base(cgroupPath)
117+
118+
// Handle cri-containerd scope format: cri-containerd-<container_id>.scope
119+
if strings.HasPrefix(lastComponent, "cri-containerd-") && strings.HasSuffix(lastComponent, ".scope") {
120+
// Remove "cri-containerd-" prefix and ".scope" suffix
121+
id := strings.TrimPrefix(lastComponent, "cri-containerd-")
122+
id = strings.TrimSuffix(id, ".scope")
123+
if isValidContainerID(id) {
124+
return id
125+
}
126+
}
127+
128+
// Handle docker format: docker-<container_id>.scope
129+
if strings.HasPrefix(lastComponent, "docker-") && strings.HasSuffix(lastComponent, ".scope") {
130+
id := strings.TrimPrefix(lastComponent, "docker-")
131+
id = strings.TrimSuffix(id, ".scope")
132+
if isValidContainerID(id) {
133+
return id
134+
}
135+
}
136+
137+
// Handle plain container ID format (last path component is the container ID)
138+
// Example: /kubepods/besteffort/pod<uuid>/<container_id>
139+
if isValidContainerID(lastComponent) {
140+
return lastComponent
141+
}
142+
143+
return ""
144+
}
145+
146+
// isValidContainerID reports whether id has the shape of a full container ID:
// exactly 64 bytes, each of which is an ASCII hexadecimal digit (either case).
func isValidContainerID(id string) bool {
	const idLen = 64
	if len(id) != idLen {
		return false
	}
	for i := 0; i < len(id); i++ {
		switch b := id[i]; {
		case '0' <= b && b <= '9':
		case 'a' <= b && b <= 'f':
		case 'A' <= b && b <= 'F':
		default:
			// Any other byte (including bytes of multi-byte runes) disqualifies the ID.
			return false
		}
	}
	return true
}
159+
160+
// getNsPid looks up the "NSpid:" line in /proc/<pid>/status and returns its
// last whitespace-separated field, i.e. the PID listed for the innermost
// namespace on that line.
//
// Example: for "NSpid: 12345 1" the result is "1".
//
// An error is returned when the status file cannot be opened or read, when
// the NSpid line carries no values, or when no NSpid line is present.
func getNsPid(pid string) (string, error) {
	const key = "NSpid:"
	statusPath := filepath.Join("/proc", pid, "status")

	f, err := os.Open(statusPath)
	if err != nil {
		return "", fmt.Errorf("failed to open %s: %w", statusPath, err)
	}
	defer f.Close()

	sc := bufio.NewScanner(f)
	for sc.Scan() {
		line := sc.Text()
		if !strings.HasPrefix(line, key) {
			continue
		}

		// Everything after the key is a whitespace-separated PID list.
		fields := strings.Fields(line[len(key):])
		if n := len(fields); n > 0 {
			return fields[n-1], nil
		}
		return "", fmt.Errorf("NSpid line is empty in %s", statusPath)
	}

	if err := sc.Err(); err != nil {
		return "", fmt.Errorf("failed to read %s: %w", statusPath, err)
	}

	return "", fmt.Errorf("NSpid not found in %s", statusPath)
}
192+
193+
// execClearRefs invokes "crictl exec <containerID> <clearRefsBin> <nspid>"
// and waits for it to finish. Stdout and stderr of the command are captured
// together: on failure they are attached to the returned error, on success
// any non-empty output is echoed to stdout.
func execClearRefs(containerID, clearRefsBin, nspid string) error {
	args := []string{"exec", containerID, clearRefsBin, nspid}

	out, err := exec.Command("crictl", args...).CombinedOutput()
	switch {
	case err != nil:
		return fmt.Errorf("crictl exec failed: %w\nOutput: %s", err, string(out))
	case len(out) > 0:
		fmt.Printf("crictl exec output: %s\n", string(out))
	}

	return nil
}

criu/cr-restore.c

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1687,6 +1687,14 @@ static int __restore_task_with_children(void *_arg)
16871687
goto err;
16881688
}
16891689

1690+
/*
1691+
* All children have finished premapping (forking barrier passed).
1692+
* Convert PROT_EXEC VMAs from anonymous to memfd-backed so the
1693+
* restorer's mprotect won't trigger SELinux execmem/execmod.
1694+
*/
1695+
if (finalize_exec_mappings(current))
1696+
goto err;
1697+
16901698
if (restore_one_task(vpid(current), ca->core))
16911699
goto err;
16921700

@@ -2517,7 +2525,7 @@ static unsigned long restorer_len;
25172525
static int prepare_restorer_blob(void)
25182526
{
25192527
/*
2520-
* We map anonymous mapping, not mremap the restorer itself later.
2528+
* We map anonymous memory, not mremap the restorer itself later.
25212529
* Otherwise the restorer vma would be tied to criu binary which
25222530
* in turn will lead to set-exe-file prctl to fail with EBUSY.
25232531
*/
@@ -2539,7 +2547,7 @@ static int prepare_restorer_blob(void)
25392547
*/
25402548
restorer_len = round_up(pbd.hdr.args_off, page_size());
25412549

2542-
restorer = mmap(NULL, restorer_len, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
2550+
restorer = mmap(NULL, restorer_len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
25432551
if (restorer == MAP_FAILED) {
25442552
pr_perror("Can't map restorer code");
25452553
return -1;
@@ -2554,6 +2562,8 @@ static int remap_restorer_blob(void *addr)
25542562
{
25552563
struct parasite_blob_desc pbd;
25562564
void *mem;
2565+
int fd;
2566+
ssize_t off, ret;
25572567

25582568
mem = mremap(restorer, restorer_len, restorer_len, MREMAP_FIXED | MREMAP_MAYMOVE, addr);
25592569
if (mem != addr) {
@@ -2570,6 +2580,49 @@ static int remap_restorer_blob(void *addr)
25702580
restorer_setup_c_header_desc(&pbd, true);
25712581
compel_relocs_apply(addr, addr, &pbd);
25722582

2583+
/*
2584+
* To avoid SELinux execmem denials, back the restorer blob with a
2585+
* memfd instead of anonymous memory. The kernel's
2586+
* file_map_prot_check() requires process:execmem when:
2587+
* (PROT_EXEC) && (!file || IS_PRIVATE(inode) || (!shared && PROT_WRITE))
2588+
*
2589+
* Using MAP_SHARED on a memfd (file-backed, !IS_PRIVATE) makes the
2590+
* !shared term false, so execmem is never checked. Instead SELinux
2591+
* checks file:execute (which runtime_t has on Bottlerocket).
2592+
*
2593+
* Each restored process calls remap_restorer_blob() independently
2594+
* with its own memfd, so MAP_SHARED pages are not shared across
2595+
* the process tree.
2596+
*/
2597+
fd = memfd_create("restorer", 0);
2598+
if (fd < 0) {
2599+
pr_perror("Can't create memfd for restorer");
2600+
return -1;
2601+
}
2602+
2603+
if (ftruncate(fd, restorer_len)) {
2604+
pr_perror("Can't resize restorer memfd");
2605+
goto err;
2606+
}
2607+
2608+
for (off = 0; off < restorer_len; ) {
2609+
ret = write(fd, addr + off, restorer_len - off);
2610+
if (ret <= 0) {
2611+
pr_perror("Can't write restorer blob to memfd");
2612+
goto err;
2613+
}
2614+
off += ret;
2615+
}
2616+
2617+
mem = mmap(addr, restorer_len, PROT_READ | PROT_WRITE | PROT_EXEC,
2618+
MAP_SHARED | MAP_FIXED, fd, 0);
2619+
if (mem != addr) {
2620+
pr_perror("Can't map restorer memfd");
2621+
goto err;
2622+
}
2623+
2624+
close(fd);
2625+
25732626
/*
25742627
* Ensure the infected thread sees the updated code.
25752628
*
@@ -2582,6 +2635,10 @@ static int remap_restorer_blob(void *addr)
25822635
__builtin___clear_cache(addr, addr + pbd.hdr.bsize);
25832636

25842637
return 0;
2638+
2639+
err:
2640+
close(fd);
2641+
return -1;
25852642
}
25862643

25872644
static int validate_sched_parm(struct rst_sched_param *sp)

criu/filesystems.c

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -417,9 +417,8 @@ static int tmpfs_dump(struct mount_info *pm)
417417
userns_pid = root_item->pid->real;
418418

419419
ret = cr_system_userns(fd, img_raw_fd(img), -1, "tar",
420-
(char *[]){ "tar", "--create", "--gzip", "--no-unquote", "--no-wildcards",
421-
"--one-file-system", "--check-links", "--preserve-permissions", "--sparse",
422-
"--numeric-owner", "--directory", "/proc/self/fd/0", ".", NULL },
420+
(char *[]){ "tar", "-czf", "-", "--numeric-owner",
421+
"-C", "/proc/self/fd/0", ".", NULL },
423422
0, userns_pid);
424423

425424
if (ret)
@@ -451,8 +450,7 @@ static int tmpfs_restore(struct mount_info *pm)
451450
}
452451

453452
ret = cr_system(img_raw_fd(img), -1, -1, "tar",
454-
(char *[]){ "tar", "--extract", "--gzip", "--no-unquote", "--no-wildcards", "--directory",
455-
service_mountpoint(pm), NULL },
453+
(char *[]){ "tar", "-xzf", "-", "-C", service_mountpoint(pm), NULL },
456454
0);
457455
close_image(img);
458456

0 commit comments

Comments
 (0)