Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,235 changes: 1,221 additions & 14 deletions Cargo.lock

Large diffs are not rendered by default.

17 changes: 16 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ license = "MIT"
[dependencies]
anyhow = "1"
thiserror = "1"
clap = { version = "4", features = ["derive"] }
clap = { version = "4", features = ["derive", "env"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
nix = { version = "0.28", features = ["signal", "process", "user", "sched", "mount", "fs", "term"] }
Expand All @@ -23,6 +23,16 @@ protobuf = "3.3"
containerd-shim = { version = "0.10", features = ["async", "tracing"] }
containerd-shim-protos = { version = "0.10", features = ["async"] }

# reaper-agent dependencies
kube = { version = "0.98", features = ["runtime", "client", "derive"], optional = true }
k8s-openapi = { version = "0.24", features = ["v1_31"], optional = true }
prometheus-client = { version = "0.23", optional = true }
axum = { version = "0.8", optional = true }
futures = { version = "0.3", optional = true }

[features]
agent = ["kube", "k8s-openapi", "prometheus-client", "axum", "futures"]

[dev-dependencies]
tempfile = "3"
serial_test = "3"
Expand All @@ -35,6 +45,11 @@ path = "src/bin/reaper-runtime/main.rs"
name = "containerd-shim-reaper-v2"
path = "src/bin/containerd-shim-reaper-v2/main.rs"

[[bin]]
name = "reaper-agent"
path = "src/bin/reaper-agent/main.rs"
required-features = ["agent"]

[lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }

Expand Down
27 changes: 27 additions & 0 deletions Dockerfile.agent
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Multi-stage build for reaper-agent
# Produces a minimal static binary for Kubernetes DaemonSet deployment.

# --- Builder stages (one per architecture) ---
# Each stage normalizes the built binary to /reaper-agent so the runtime
# stage can copy from a single, arch-independent path.
FROM messense/rust-musl-cross:x86_64-musl AS builder-amd64
WORKDIR /work
COPY . .
RUN cargo build --release --features agent --bin reaper-agent --target x86_64-unknown-linux-musl \
    && cp target/x86_64-unknown-linux-musl/release/reaper-agent /reaper-agent

FROM messense/rust-musl-cross:aarch64-musl AS builder-arm64
WORKDIR /work
COPY . .
RUN cargo build --release --features agent --bin reaper-agent --target aarch64-unknown-linux-musl \
    && cp target/aarch64-unknown-linux-musl/release/reaper-agent /reaper-agent

# --- Runtime stage ---
# Use distroless for a minimal image with ca-certificates (needed for K8s API TLS)
FROM gcr.io/distroless/static-debian12

# TARGETARCH is set by BuildKit/buildx to "amd64" or "arm64" per --platform,
# matching the builder stage names exactly. (The previous
# "builder-${TARGETARCH}64" expanded to e.g. "builder-amd6464", which is not
# a defined stage and broke the build.)
ARG TARGETARCH
COPY --from=builder-${TARGETARCH} /reaper-agent /reaper-agent

ENTRYPOINT ["/reaper-agent"]
1 change: 1 addition & 0 deletions deploy/ansible/install-reaper.yml
Original file line number Diff line number Diff line change
Expand Up @@ -237,5 +237,6 @@
- "1. Create RuntimeClass: kubectl apply -f deploy/kubernetes/runtimeclass.yaml"
- "2. Deploy test pod: kubectl apply -f deploy/kubernetes/reaper-example.yaml"
- "3. Verify: kubectl logs reaper-example"
- "4. Optional: kubectl apply -f deploy/kubernetes/reaper-agent.yaml (config sync, GC, metrics)"
- ""
- "To rollback: ansible-playbook -i inventory.ini deploy/ansible/rollback-reaper.yml"
152 changes: 152 additions & 0 deletions deploy/kubernetes/reaper-agent.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
---
# Namespace holding all Reaper control-plane objects.
apiVersion: v1
kind: Namespace
metadata:
  name: reaper-system
  labels:
    app.kubernetes.io/part-of: reaper
---
# Cluster-wide source of truth for the node-level reaper.conf.
# The reaper-agent DaemonSet watches this ConfigMap and syncs its
# contents to /etc/reaper/reaper.conf on every node.
apiVersion: v1
kind: ConfigMap
metadata:
  name: reaper-config
  namespace: reaper-system
  labels:
    app.kubernetes.io/part-of: reaper
    app.kubernetes.io/component: config
data:
  reaper.conf: |
    # Reaper runtime configuration
    # Managed by reaper-agent ConfigMap sync.
    # Edit this ConfigMap to change Reaper settings on all nodes.
    REAPER_DNS_MODE=host
    REAPER_OVERLAY_ISOLATION=namespace
    REAPER_ANNOTATIONS_ENABLED=true
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: reaper-agent
  namespace: reaper-system
  labels:
    app.kubernetes.io/part-of: reaper
    app.kubernetes.io/component: agent
---
# Minimal RBAC: the agent only needs read/watch access to ConfigMaps.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: reaper-agent
  labels:
    app.kubernetes.io/part-of: reaper
    app.kubernetes.io/component: agent
rules:
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["get", "watch", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: reaper-agent
  labels:
    app.kubernetes.io/part-of: reaper
    app.kubernetes.io/component: agent
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: reaper-agent
subjects:
  - kind: ServiceAccount
    name: reaper-agent
    namespace: reaper-system
---
# One agent pod per node: syncs reaper.conf from the ConfigMap and
# serves Prometheus metrics plus health probes on port 9100.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: reaper-agent
  namespace: reaper-system
  labels:
    app.kubernetes.io/name: reaper-agent
    app.kubernetes.io/part-of: reaper
    app.kubernetes.io/component: agent
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: reaper-agent
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app.kubernetes.io/name: reaper-agent
        app.kubernetes.io/part-of: reaper
        app.kubernetes.io/component: agent
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9100"
        prometheus.io/path: "/metrics"
    spec:
      serviceAccountName: reaper-agent
      # NOTE(review): hostPID grants visibility of all host processes —
      # presumably required for the planned mount-namespace cleanup; confirm
      # it is needed for the config-sync-only feature set.
      hostPID: true
      # Run on every node, including control-plane/tainted nodes.
      tolerations:
        - operator: Exists
          effect: NoSchedule
      containers:
        - name: agent
          # NOTE(review): consider pinning a versioned tag instead of :latest
          # for reproducible rollouts.
          image: ghcr.io/miguelgila/reaper-agent:latest
          imagePullPolicy: IfNotPresent
          securityContext:
            # Root is required to write the host-mounted config and state dirs.
            runAsUser: 0
          args:
            - --config-namespace=reaper-system
            - --config-name=reaper-config
            - --config-path=/host/etc/reaper/reaper.conf
            - --state-dir=/host/run/reaper
            - --shim-path=/host/usr/local/bin/containerd-shim-reaper-v2
            - --runtime-path=/host/usr/local/bin/reaper-runtime
          ports:
            - containerPort: 9100
              name: metrics
              protocol: TCP
          livenessProbe:
            httpGet:
              path: /healthz
              port: metrics
            initialDelaySeconds: 10
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /readyz
              port: metrics
            initialDelaySeconds: 5
            periodSeconds: 10
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
          volumeMounts:
            - name: etc-reaper
              mountPath: /host/etc/reaper
            - name: run-reaper
              mountPath: /host/run/reaper
            # Read-only view of host binaries; the agent only inspects
            # shim/runtime paths passed via --shim-path/--runtime-path.
            - name: usr-local-bin
              mountPath: /host/usr/local/bin
              readOnly: true
      volumes:
        - name: etc-reaper
          hostPath:
            path: /etc/reaper
            type: DirectoryOrCreate
        - name: run-reaper
          hostPath:
            path: /run/reaper
            type: DirectoryOrCreate
        - name: usr-local-bin
          hostPath:
            path: /usr/local/bin
            type: Directory
43 changes: 43 additions & 0 deletions docs/BUGS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Known Bugs and Flaky Tests

## DNS Mode Annotation Override Test Flake

**Test:** `DNS mode annotation override (host vs kubernetes)`
**Severity:** Low (intermittent, CI-only)
**Status:** Open

### Symptoms

The test times out (64s) waiting for the `reaper-dns-annot-default` or
`reaper-dns-annot-host` pod to reach `Succeeded` phase. The pod gets stuck
and containerd reports:

```
failed to stop sandbox: task must be stopped before deletion: running: failed precondition
```

### Root Cause

A timing race in containerd's sandbox lifecycle. When the shim reports the
container has exited, containerd sometimes tries to delete the task before
it has fully transitioned out of the `running` state. This causes a
`failed precondition` error that prevents sandbox teardown, leaving the pod
stuck.

This is a containerd-level issue, not a Reaper bug. It tends to surface
under load (e.g., when many pods are created/deleted in quick succession
during the integration test suite).

### Workarounds

- Re-running the test suite usually passes on retry.
- The `--agent-only` flag skips this test entirely for fast agent iteration.
- Running with `--no-cleanup` and re-running `--skip-cargo --no-cleanup`
often avoids the race since the cluster is warmer.

### Related

- Observed in Kind clusters with containerd v1.7+.
- The `Combined annotations` test exercises similar annotation logic and
passes reliably, suggesting the issue is timing-related rather than
functional.
7 changes: 6 additions & 1 deletion docs/TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,9 @@ List of tasks to do, not ordered in any specific way.
- [x] Add certain configuration parameters as annotations, so users can influence how Reaper works (DNS, overlay name and mount point, etc.). But ensuring administrator parameters cannot be overridden.
- [ ] Introduce more complex examples, answer this question: can we have a sssd containerd pod expose its socks file so a sample reaper pod can utilize it?
- [ ] Produce RPM and DEB packages compatible with major distributions (SUSE, RHEL, Debian, Ubuntu). This will help with installation and deployment.
- [ ] Evaluate if Reaper can be configured using a Kubernetes ConfigMap instead of relying on a node-level config file.
- [x] Evaluate if Reaper can be configured using a Kubernetes ConfigMap instead of relying on a node-level config file. (Implemented via `reaper-agent` DaemonSet — PR #27)
- [ ] reaper-agent Phase 2: Overlay GC — reconcile overlay namespaces against Kubernetes API, delete overlays for namespaces that no longer exist
- [ ] reaper-agent Phase 2: Binary self-update — watch ConfigMap version field, download and replace shim/runtime binaries
- [ ] reaper-agent Phase 2: Node condition reporting — patch Node object with `ReaperReady` condition
- [ ] reaper-agent Phase 2: Mount namespace cleanup — detect and unmount stale `/run/reaper/ns/*` bind-mounts
- [ ] Fix known bugs documented in [docs/BUGS.md](BUGS.md)
Loading