Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions deploy/kubernetes/reaper-agent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ rules:
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
Expand Down Expand Up @@ -134,6 +137,7 @@ spec:
mountPath: /host/etc/reaper
- name: run-reaper
mountPath: /host/run/reaper
mountPropagation: HostToContainer
- name: usr-local-bin
mountPath: /host/usr/local/bin
readOnly: true
Expand Down
4 changes: 2 additions & 2 deletions docs/TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ List of tasks to do, not ordered in any specific way.
- [ ] Introduce more complex examples, answer this question: can we have a sssd containerd pod expose its socks file so a sample reaper pod can utilize it?
- [ ] Produce RPM and DEB packages compatible with major distributions (SUSE, RHEL, Debian, Ubuntu). This will help with installation and deployment.
- [x] Evaluate if Reaper can be configured using a Kubernetes ConfigMap instead of relying on a node-level config file. (Implemented via `reaper-agent` DaemonSet — PR #27)
- [ ] reaper-agent Phase 2: Overlay GC — reconcile overlay namespaces against Kubernetes API, delete overlays for namespaces that no longer exist
- [x] reaper-agent Phase 2: Overlay GC — reconcile overlay namespaces against Kubernetes API, delete overlays for namespaces that no longer exist
- [ ] reaper-agent Phase 2: Binary self-update — watch ConfigMap version field, download and replace shim/runtime binaries
- [ ] reaper-agent Phase 2: Node condition reporting — patch Node object with `ReaperReady` condition
- [ ] reaper-agent Phase 2: Mount namespace cleanup — detect and unmount stale `/run/reaper/ns/*` bind-mounts
- [x] reaper-agent Phase 2: Mount namespace cleanup — detect and unmount stale `/run/reaper/ns/*` bind-mounts
- [ ] Fix known bugs documented in [docs/BUGS.md](BUGS.md)
364 changes: 364 additions & 0 deletions scripts/lib/test-integration-suite.sh

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions src/bin/reaper-agent/gc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,22 @@ pub async fn run_gc(state_dir: &str, metrics: &MetricsState) {
let mut created = 0u64;
let mut cleaned = 0u64;

// Infrastructure directories that are NOT container state dirs — skip during GC
const INFRA_DIRS: &[&str] = &["overlay", "merged", "ns"];

for entry in entries.flatten() {
let path = entry.path();
if !path.is_dir() {
continue;
}

// Skip overlay infrastructure directories (managed by overlay GC, not container GC)
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
if INFRA_DIRS.contains(&name) {
continue;
}
}

let state_file = path.join("state.json");
if !state_file.exists() {
// Directory with no state.json — orphaned, clean up
Expand Down
25 changes: 25 additions & 0 deletions src/bin/reaper-agent/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ mod config_sync;
mod gc;
mod health;
mod metrics;
mod overlay_gc;

// config.rs is available as shared module but not needed by the agent
// (the agent writes config files, it doesn't read them)
Expand Down Expand Up @@ -63,6 +64,14 @@ struct Cli {
#[arg(long, default_value = "60", env = "REAPER_AGENT_GC_INTERVAL")]
gc_interval: u64,

/// Overlay GC reconciliation interval in seconds
#[arg(long, default_value = "300", env = "REAPER_AGENT_OVERLAY_GC_INTERVAL")]
overlay_gc_interval: u64,

/// Enable overlay GC (reconcile overlay dirs against K8s namespaces)
#[arg(long, default_value = "true", env = "REAPER_AGENT_OVERLAY_GC_ENABLED")]
overlay_gc_enabled: bool,

/// Base state directory (via hostPath mount)
#[arg(
long,
Expand Down Expand Up @@ -146,6 +155,19 @@ async fn main() -> anyhow::Result<()> {
}
});

// Spawn overlay GC loop (reconcile overlay dirs against K8s namespaces)
let overlay_gc_handle = if cli.overlay_gc_enabled {
let ogc_state_dir = cli.state_dir.clone();
let ogc_metrics = metrics_state.clone();
let ogc_interval = cli.overlay_gc_interval;
Some(tokio::spawn(async move {
overlay_gc::overlay_gc_loop(&ogc_state_dir, ogc_interval, &ogc_metrics).await;
}))
} else {
info!("overlay GC disabled via --overlay-gc-enabled=false");
None
};

let server_metrics = metrics_state.clone();
let server_shim = cli.shim_path.clone();
let server_runtime = cli.runtime_path.clone();
Expand Down Expand Up @@ -175,6 +197,9 @@ async fn main() -> anyhow::Result<()> {
health_handle.abort();
sync_handle.abort();
server_handle.abort();
if let Some(h) = overlay_gc_handle {
h.abort();
}

Ok(())
}
68 changes: 68 additions & 0 deletions src/bin/reaper-agent/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ struct MetricsInner {

// Health gauge
healthy: Gauge,

// Overlay GC metrics
overlay_gc_runs_total: Counter,
overlay_gc_cleaned_total: Counter,
overlay_namespaces: Gauge,

// Namespace cleanup metrics
ns_cleanup_runs_total: Counter,
ns_cleaned_total: Counter,
}

impl MetricsState {
Expand All @@ -42,6 +51,11 @@ impl MetricsState {
let config_syncs_total = Counter::default();
let gc_runs_total = Counter::default();
let healthy = Gauge::default();
let overlay_gc_runs_total = Counter::default();
let overlay_gc_cleaned_total = Counter::default();
let overlay_namespaces = Gauge::default();
let ns_cleanup_runs_total = Counter::default();
let ns_cleaned_total = Counter::default();

registry.register(
"reaper_containers_created",
Expand Down Expand Up @@ -73,6 +87,31 @@ impl MetricsState {
"Whether the agent considers the node healthy (1=healthy, 0=unhealthy)",
healthy.clone(),
);
registry.register(
"reaper_agent_overlay_gc_runs_total",
"Total number of overlay GC reconciliation cycles",
overlay_gc_runs_total.clone(),
);
registry.register(
"reaper_agent_overlay_gc_cleaned_total",
"Total number of overlay namespaces cleaned up",
overlay_gc_cleaned_total.clone(),
);
registry.register(
"reaper_agent_overlay_namespaces",
"Current number of on-disk overlay namespace directories",
overlay_namespaces.clone(),
);
registry.register(
"reaper_agent_ns_cleanup_runs_total",
"Total number of mount namespace cleanup passes",
ns_cleanup_runs_total.clone(),
);
registry.register(
"reaper_agent_ns_cleaned_total",
"Total number of stale namespace bind-mount files removed",
ns_cleaned_total.clone(),
);

Self {
inner: Arc::new(MetricsInner {
Expand All @@ -83,6 +122,11 @@ impl MetricsState {
config_syncs_total,
gc_runs_total,
healthy,
overlay_gc_runs_total,
overlay_gc_cleaned_total,
overlay_namespaces,
ns_cleanup_runs_total,
ns_cleaned_total,
}),
}
}
Expand Down Expand Up @@ -110,6 +154,30 @@ impl MetricsState {
self.inner.healthy.get() == 1
}

/// Records one overlay GC reconciliation cycle by bumping the
/// `reaper_agent_overlay_gc_runs_total` counter.
pub fn inc_overlay_gc_runs(&self) {
    let runs = &self.inner.overlay_gc_runs_total;
    runs.inc();
}

/// Adds `count` to the `reaper_agent_overlay_gc_cleaned_total` counter.
///
/// The whole amount is applied with a single `inc_by` call rather than
/// `count` individual increments, so the cost no longer scales with the
/// number of cleaned overlays. A `count` of zero leaves the counter unchanged.
pub fn inc_overlay_gc_cleaned(&self, count: u64) {
    self.inner.overlay_gc_cleaned_total.inc_by(count);
}

/// Records one mount namespace cleanup pass by bumping the
/// `reaper_agent_ns_cleanup_runs_total` counter.
pub fn inc_ns_cleanup_runs(&self) {
    let passes = &self.inner.ns_cleanup_runs_total;
    passes.inc();
}

/// Adds `count` to the `reaper_agent_ns_cleaned_total` counter.
///
/// The whole amount is applied with a single `inc_by` call rather than
/// `count` individual increments, so the cost no longer scales with the
/// number of removed bind-mount files. A `count` of zero leaves the counter
/// unchanged.
pub fn inc_ns_cleaned(&self, count: u64) {
    self.inner.ns_cleaned_total.inc_by(count);
}

/// Sets the `reaper_agent_overlay_namespaces` gauge to `count`.
///
/// The gauge is written as an `i64`. Values above `i64::MAX` are clamped to
/// `i64::MAX` instead of wrapping to a negative number, which a plain `as`
/// cast would do.
pub fn set_overlay_namespaces(&self, count: u64) {
    // The `as u64` on `i64::MAX` is lossless; after `min` the value fits in i64.
    let clamped = count.min(i64::MAX as u64) as i64;
    self.inner.overlay_namespaces.set(clamped);
}

pub fn encode(&self) -> String {
let mut buf = String::new();
let registry = self.inner.registry.lock().unwrap();
Expand Down
Loading
Loading