
Commit 7516484

Merge pull request #46 from ActiDoo/codex/prevent-reboot-when-node-is-unhealthy
Add cluster health coordination and enforce cluster policies before reboots
2 parents 442286f + 2dab587 commit 7516484

9 files changed: +1207 −40 lines changed


README.md

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,8 @@ service that favours safety, explicit configuration, and verifiable supply-chain
   timing/exit-code data for diagnostics.
 - **Health gating** – executes an operator-supplied script twice (pre- and post-lock) with rich environment variables for
   node identity, cluster policies, maintenance windows, and optional metrics endpoints.
+- **Cluster-wide health coordination** – persists unhealthy node markers in etcd so peers refuse to reboot while any script is
+  reporting failure, keeps publishing each node's health even when no reboot is pending, applies configured cluster policy thresholds (minimum healthy counts, fractions, fallback protections) before allowing another reboot, and clears the block automatically once the node becomes healthy again.【F:pkg/clusterhealth/etcd.go†L18-L153】【F:pkg/orchestrator/runner.go†L321-L469】
 - **Distributed coordination** – etcd-backed mutex with annotated metadata (`node`, `pid`, `acquired_at`) so operators can
   inspect lock holders during incidents.
 - **Safeguards** – kill switch file, dry-run mode, deny/allow maintenance windows, a configurable cooldown between
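
To make the coordination described in that bullet concrete, here is a minimal, self-contained Go sketch of the gating idea — not the project's actual `pkg/clusterhealth` implementation. The `Record` fields and the in-memory `store` are illustrative stand-ins for the etcd-backed markers: every pass publishes or clears the node's marker, and a reboot is refused while any marker remains.

```go
package main

import (
	"fmt"
	"time"
)

// Record mirrors the kind of marker described above: which node is unhealthy,
// why, and when it last reported. Field names here are illustrative.
type Record struct {
	Node       string
	Stage      string
	Reason     string
	ReportedAt time.Time
}

// store stands in for the etcd prefix holding one marker per unhealthy node.
type store map[string]Record

// reportHealth publishes or clears this node's marker after every script run,
// even when no reboot is pending, so peers always see a current cluster view.
func (s store) reportHealth(node string, healthy bool, reason string) {
	if healthy {
		delete(s, node) // a healthy pass clears the block automatically
		return
	}
	s[node] = Record{Node: node, Stage: "pre-lock", Reason: reason, ReportedAt: time.Now().UTC()}
}

// rebootAllowed refuses to proceed while any peer still has a marker.
func (s store) rebootAllowed() (bool, []Record) {
	blockers := make([]Record, 0, len(s))
	for _, rec := range s {
		blockers = append(blockers, rec)
	}
	return len(blockers) == 0, blockers
}

func main() {
	s := store{}
	s.reportHealth("node-b", false, "kubelet not ready")

	if ok, blockers := s.rebootAllowed(); !ok {
		fmt.Println("reboot blocked by cluster health markers:")
		for _, rec := range blockers {
			fmt.Printf("  - %s (stage=%s, reason=%s)\n", rec.Node, rec.Stage, rec.Reason)
		}
	}

	s.reportHealth("node-b", true, "") // a later healthy pass clears the marker
	if ok, _ := s.rebootAllowed(); ok {
		fmt.Println("reboot allowed")
	}
}
```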

cmd/clusterrebootd/main.go

Lines changed: 78 additions & 4 deletions
@@ -17,6 +17,7 @@ import (
 	"syscall"
 	"time"
 
+	"github.com/clusterrebootd/clusterrebootd/pkg/clusterhealth"
 	"github.com/clusterrebootd/clusterrebootd/pkg/config"
 	"github.com/clusterrebootd/clusterrebootd/pkg/cooldown"
 	"github.com/clusterrebootd/clusterrebootd/pkg/detector"
@@ -141,7 +142,10 @@ func commandRunWithWriters(args []string, stdout, stderr io.Writer) int {
 	}
 	defer locker.Close()
 
-	var cooldownManager *cooldown.EtcdManager
+	var (
+		cooldownManager      *cooldown.EtcdManager
+		clusterHealthManager *clusterhealth.EtcdManager
+	)
 	if cfg.MinRebootIntervalSec > 0 {
 		cdMgr, cdErr := cooldown.NewEtcdManager(cooldown.EtcdManagerOptions{
 			Endpoints: cfg.EtcdEndpoints,
@@ -159,6 +163,21 @@ func commandRunWithWriters(args []string, stdout, stderr io.Writer) int {
 		defer cooldownManager.Close()
 	}
 
+	chMgr, chErr := clusterhealth.NewEtcdManager(clusterhealth.EtcdManagerOptions{
+		Endpoints:   cfg.EtcdEndpoints,
+		DialTimeout: 5 * time.Second,
+		Namespace:   cfg.EtcdNamespace,
+		Prefix:      "cluster_health",
+		TLS:         tlsConfig,
+		NodeName:    cfg.NodeName,
+	})
+	if chErr != nil {
+		fmt.Fprintf(stderr, "failed to initialise cluster health manager: %v\n", chErr)
+		return exitRunError
+	}
+	clusterHealthManager = chMgr
+	defer clusterHealthManager.Close()
+
 	jsonLogger := observability.NewJSONLogger(stderr)
 	metricsCollector := observability.MetricsCollector(observability.NoopMetricsCollector{})
 	var (
@@ -214,6 +233,7 @@ func commandRunWithWriters(args []string, stdout, stderr io.Writer) int {
 	if cooldownManager != nil {
 		runnerOptions = append(runnerOptions, orchestrator.WithCooldownManager(cooldownManager))
 	}
+	runnerOptions = append(runnerOptions, orchestrator.WithClusterHealthManager(clusterHealthManager))
 	runner, err := orchestrator.NewRunner(cfg, engine, healthRunner, locker, runnerOptions...)
 	if err != nil {
 		fmt.Fprintf(stderr, "failed to initialise orchestrator: %v\n", err)
@@ -380,7 +400,10 @@ func commandStatusWithWriters(args []string, stdout, stderr io.Writer) int {
 		}
 		locker = etcdManager
 	}
-	var cooldownManager *cooldown.EtcdManager
+	var (
+		cooldownManager      *cooldown.EtcdManager
+		clusterHealthManager *clusterhealth.EtcdManager
+	)
 	if cfgCopy.MinRebootIntervalSec > 0 {
 		cdMgr, cdErr := cooldown.NewEtcdManager(cooldown.EtcdManagerOptions{
 			Endpoints: cfgCopy.EtcdEndpoints,
@@ -403,13 +426,32 @@ func commandStatusWithWriters(args []string, stdout, stderr io.Writer) int {
 		defer cooldownManager.Close()
 	}
 
+	if !*skipLock {
+		chMgr, chErr := clusterhealth.NewEtcdManager(clusterhealth.EtcdManagerOptions{
+			Endpoints:   cfgCopy.EtcdEndpoints,
+			DialTimeout: 5 * time.Second,
+			Namespace:   cfgCopy.EtcdNamespace,
+			Prefix:      "cluster_health",
+			TLS:         tlsConfig,
+			NodeName:    cfgCopy.NodeName,
+		})
+		if chErr != nil {
+			fmt.Fprintf(stderr, "failed to initialise cluster health manager: %v\n", chErr)
+			return exitRunError
+		}
+		clusterHealthManager = chMgr
+		defer clusterHealthManager.Close()
+	}
+
 	runnerOptions := []orchestrator.Option{orchestrator.WithMaxLockAttempts(1), orchestrator.WithCommandEnvironment(baseEnv)}
 	if *skipLock {
 		runnerOptions = append(runnerOptions, orchestrator.WithLockAcquisition(false, "lock acquisition skipped (--skip-lock)"))
 	}
 	if cooldownManager != nil {
 		runnerOptions = append(runnerOptions, orchestrator.WithCooldownManager(cooldownManager))
 	}
+	runnerOptions = append(runnerOptions, orchestrator.WithClusterHealthManager(clusterHealthManager))
+	runnerOptions = append(runnerOptions, orchestrator.WithClusterHealthReporting(false))
 
 	runner, err := orchestrator.NewRunner(&cfgCopy, engine, healthRunner, locker, runnerOptions...)
 	if err != nil {
@@ -585,11 +627,19 @@ func writeHealthResult(w io.Writer, label string, res *health.Result) {
 	}
 }
 
+func healthLabel(phase, fallback string) string {
+	phase = strings.TrimSpace(phase)
+	if phase == "" {
+		phase = fallback
+	}
+	return fmt.Sprintf("%s health", phase)
+}
+
 func reportOutcome(stdout io.Writer, outcome orchestrator.Outcome) {
 	fmt.Fprintln(stdout, "pre-lock detector evaluations:")
 	writeDetectorResults(stdout, outcome.DetectorResults)
 	if outcome.PreLockHealthResult != nil {
-		writeHealthResult(stdout, "pre-lock health", outcome.PreLockHealthResult)
+		writeHealthResult(stdout, healthLabel(outcome.PreLockHealthPhase, "pre-lock"), outcome.PreLockHealthResult)
 	}
 	if outcome.LockAcquired {
 		fmt.Fprintln(stdout, "lock acquired")
@@ -599,7 +649,11 @@ func reportOutcome(stdout io.Writer, outcome orchestrator.Outcome) {
 		writeDetectorResults(stdout, outcome.PostLockDetectorResults)
 	}
 	if outcome.PostLockHealthResult != nil {
-		writeHealthResult(stdout, "post-lock health", outcome.PostLockHealthResult)
+		writeHealthResult(stdout, healthLabel(outcome.PostLockHealthPhase, "post-lock"), outcome.PostLockHealthResult)
+	}
+	if len(outcome.ClusterUnhealthy) > 0 {
+		fmt.Fprintln(stdout, "cluster health blockers:")
+		writeClusterHealthRecords(stdout, outcome.ClusterUnhealthy)
 	}
 	fmt.Fprintf(stdout, "outcome: %s - %s\n", outcome.Status, outcome.Message)
 	if len(outcome.Command) > 0 {
@@ -610,6 +664,26 @@ func reportOutcome(stdout io.Writer, outcome orchestrator.Outcome) {
 	}
 }
 
+func writeClusterHealthRecords(w io.Writer, records []clusterhealth.Record) {
+	for _, rec := range records {
+		details := []string{}
+		if rec.Stage != "" {
+			details = append(details, fmt.Sprintf("stage=%s", rec.Stage))
+		}
+		if rec.Reason != "" {
+			details = append(details, fmt.Sprintf("reason=%s", rec.Reason))
+		}
+		if !rec.ReportedAt.IsZero() {
+			details = append(details, fmt.Sprintf("reported_at=%s", rec.ReportedAt.UTC().Format(time.RFC3339Nano)))
+		}
+		if len(details) > 0 {
+			fmt.Fprintf(w, " - %s (%s)\n", rec.Node, strings.Join(details, ", "))
+		} else {
+			fmt.Fprintf(w, " - %s\n", rec.Node)
+		}
+	}
+}
+
 func exitCodeForOutcome(out orchestrator.Outcome) int {
 	switch out.Status {
 	case orchestrator.OutcomeKillSwitch:
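
The `Record` fields consumed by `writeClusterHealthRecords` (Node, Stage, Reason, ReportedAt) are only visible through their use in the diff; the snippet below re-creates that formatting path with a local stand-in type to show roughly what `status` prints when blockers exist. Treat the type definition as an assumption inferred from the diff, not the actual `clusterhealth.Record` declaration.

```go
package main

import (
	"fmt"
	"io"
	"os"
	"strings"
	"time"
)

// record carries the fields the real writeClusterHealthRecords reads; the
// actual type lives in pkg/clusterhealth and may differ.
type record struct {
	Node       string
	Stage      string
	Reason     string
	ReportedAt time.Time
}

// writeClusterHealthRecords mirrors the formatting added to reportOutcome:
// one line per blocker, with optional stage/reason/reported_at details.
func writeClusterHealthRecords(w io.Writer, records []record) {
	for _, rec := range records {
		details := []string{}
		if rec.Stage != "" {
			details = append(details, fmt.Sprintf("stage=%s", rec.Stage))
		}
		if rec.Reason != "" {
			details = append(details, fmt.Sprintf("reason=%s", rec.Reason))
		}
		if !rec.ReportedAt.IsZero() {
			details = append(details, fmt.Sprintf("reported_at=%s", rec.ReportedAt.UTC().Format(time.RFC3339Nano)))
		}
		if len(details) > 0 {
			fmt.Fprintf(w, " - %s (%s)\n", rec.Node, strings.Join(details, ", "))
		} else {
			fmt.Fprintf(w, " - %s\n", rec.Node)
		}
	}
}

func main() {
	fmt.Println("cluster health blockers:")
	writeClusterHealthRecords(os.Stdout, []record{
		{Node: "node-b", Stage: "post-lock", Reason: "exit status 1", ReportedAt: time.Now()},
		{Node: "node-c"},
	})
}
```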

docs/OPERATIONS.md

Lines changed: 13 additions & 4 deletions
@@ -62,10 +62,12 @@ it for your environment, and run the daemon with `clusterrebootd run
    the interval in etcd and refuses new reboot attempts until the window expires,
    preventing back-to-back maintenance events.【F:examples/config.yaml†L23-L30】【F:cmd/clusterrebootd/main.go†L233-L272】【F:pkg/orchestrator/runner.go†L311-L376】
 5. **Set cluster policies and maintenance windows** – `cluster_policies`
-   expresses minimum healthy nodes and fallback protections. Maintenance windows
-   allow operators to block or explicitly permit reboots using cron-like day/time
-   ranges; deny rules always win, while allow rules opt the coordinator into the
-   listed windows.【F:examples/config.yaml†L85-L112】【F:pkg/windows/windows.go†L1-L123】
+   expresses minimum healthy nodes and fallback protections, which the
+   orchestrator enforces automatically by evaluating cluster-wide health records
+   before each reboot attempt. Maintenance windows allow operators to block or
+   explicitly permit reboots using cron-like day/time ranges; deny rules always
+   win, while allow rules opt the coordinator into the listed
+   windows.【F:examples/config.yaml†L85-L112】【F:pkg/windows/windows.go†L1-L123】【F:pkg/orchestrator/runner.go†L321-L469】
 6. **Wire observability and safety toggles** – Define `kill_switch_file` so a
    single touch blocks reboots, and enable the Prometheus listener via
    `metrics.enabled`/`metrics.listen` when metrics are required.【F:examples/config.yaml†L41-L47】【F:examples/config.yaml†L114-L118】【F:cmd/clusterrebootd/main.go†L193-L252】
@@ -98,6 +100,13 @@ Health scripts are the final safeguard before a reboot. Follow these practices:
   Diagnostics invoked with `status --skip-health` or `--skip-lock` set
   `RC_SKIP_HEALTH`/`RC_SKIP_LOCK` to `true`, allowing scripts to short-circuit
   optional checks when operators intentionally bypass them.【F:cmd/clusterrebootd/main.go†L298-L305】【F:pkg/orchestrator/runner.go†L485-L500】
+- **Expect global gating on failure** – The coordinator now stores an unhealthy
+  marker in etcd whenever the script exits non-zero, runs the script even when
+  no reboot is pending so the cluster view stays current, and evaluates the
+  configured cluster policy thresholds before allowing another reboot. Peers
+  block their own reboots until a later pass succeeds and clears the entry.
+  Use the `status` command with health checks enabled to verify the marker
+  clears after remediation.【F:pkg/clusterhealth/etcd.go†L18-L153】【F:pkg/orchestrator/runner.go†L321-L469】
 - **Return meaningful exit codes** – Exit `0` to allow the reboot, non-zero to
   block it. Write concise status details to stdout/stderr; they are captured in
   the JSON logs and CLI output for incident response.【F:cmd/clusterrebootd/main.go†L482-L517】
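
As a rough illustration of the threshold evaluation the operations guide describes — minimum healthy counts and fractions — consider the sketch below. The `policy` fields and the `rebootPermitted` helper are hypothetical; the real schema lives in `cluster_policies` in examples/config.yaml and the enforcement in pkg/orchestrator/runner.go.

```go
package main

import "fmt"

// policy captures the kind of thresholds cluster_policies expresses; the field
// names here are illustrative, not the daemon's actual configuration schema.
type policy struct {
	MinHealthyCount    int     // absolute floor of healthy nodes that must remain
	MinHealthyFraction float64 // fractional floor, e.g. 0.5 for "half the cluster"
}

// rebootPermitted applies both thresholds to the cluster state after the
// candidate node is removed from the healthy set: a reboot must not push the
// cluster below either configured floor.
func rebootPermitted(p policy, totalNodes, healthyNodes int) bool {
	remaining := healthyNodes - 1 // the rebooting node stops serving
	if remaining < p.MinHealthyCount {
		return false
	}
	if totalNodes > 0 && float64(remaining)/float64(totalNodes) < p.MinHealthyFraction {
		return false
	}
	return true
}

func main() {
	p := policy{MinHealthyCount: 2, MinHealthyFraction: 0.5}

	// 5-node cluster, 4 healthy: rebooting one leaves 3 healthy (>= 2 and >= 50%).
	fmt.Println(rebootPermitted(p, 5, 4)) // true

	// 5-node cluster, 2 healthy: rebooting would leave 1 healthy, below both floors.
	fmt.Println(rebootPermitted(p, 5, 2)) // false
}
```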

docs/STATE.md

Lines changed: 6 additions & 0 deletions
@@ -42,6 +42,12 @@
 - The health script base environment now includes cluster policy thresholds,
   fallback node lists, and configured maintenance windows so gating logic can
   enforce operator intent without re-reading the configuration file.
+- Cluster health coordination now records unhealthy nodes in etcd so any peer
+  that detects a reboot requirement blocks until the failing node reports a
+  healthy script outcome again, and the daemon runs the gate script even when no
+  reboot is pending so the cluster view stays accurate while applying the
+  configured cluster policy thresholds to prevent cascading outages when the
+  cluster is already degraded.【F:pkg/clusterhealth/etcd.go†L18-L153】【F:pkg/orchestrator/runner.go†L321-L469】
 - Reboot command execution now expands the same environment placeholders (e.g.
   `RC_NODE_NAME`) so the logged and invoked command reflects the active node
   context without depending on shell-specific substitution.
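
A condensed sketch of the per-pass behaviour this entry describes, under assumed helper names (`runScript`, `publishHealth`, `clusterAllowsReboot`) and a hypothetical script path: the health script runs on every cycle so the node's marker stays current, and the reboot path is only considered afterwards.

```go
package main

import (
	"fmt"
	"os/exec"
)

// onePass illustrates one coordinator cycle: publish health unconditionally,
// then gate the reboot on node health and the cluster-wide policy check.
func onePass(rebootRequired bool) {
	healthy := runScript("/etc/clusterrebootd/health.sh") // hypothetical script path
	publishHealth(healthy)                                // marker written or cleared even with no reboot pending

	if !rebootRequired {
		return
	}
	if !healthy || !clusterAllowsReboot() {
		fmt.Println("reboot deferred: node or cluster health gate failed")
		return
	}
	fmt.Println("proceeding to lock acquisition and reboot")
}

func runScript(path string) bool { return exec.Command(path).Run() == nil }
func publishHealth(healthy bool) { fmt.Printf("published health: healthy=%v\n", healthy) }
func clusterAllowsReboot() bool  { return true } // would consult etcd markers and cluster policies

func main() {
	onePass(false) // no reboot pending: health is still published
	onePass(true)  // reboot pending: gates are evaluated before locking
}
```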
