Skip to content

Commit cde6663

Browse files
authored
Merge pull request #41 from ActiDoo/codex/fix-variable-substitution-for-rc_node_name
Expand reboot command environment placeholders
2 parents 927d161 + 8a54076 commit cde6663

File tree

7 files changed

+136
-48
lines changed

7 files changed

+136
-48
lines changed

cmd/clusterrebootd/main.go

Lines changed: 7 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ import (
1212
"net/http"
1313
"os"
1414
"os/signal"
15-
"strconv"
1615
"strings"
1716
"syscall"
1817
"time"
@@ -116,7 +115,7 @@ func commandRunWithWriters(args []string, stdout, stderr io.Writer) int {
116115
return exitConfigError
117116
}
118117

119-
baseEnv := buildBaseEnvironment(cfg)
118+
baseEnv := cfg.BaseEnvironment()
120119

121120
tlsConfig, err := buildEtcdTLSConfig(cfg.EtcdTLS)
122121
if err != nil {
@@ -190,7 +189,10 @@ func commandRunWithWriters(args []string, stdout, stderr io.Writer) int {
190189

191190
reporter := orchestrator.NewStructuredReporter(cfg.NodeName, jsonLogger, metricsCollector)
192191

193-
runner, err := orchestrator.NewRunner(cfg, engine, healthRunner, locker, orchestrator.WithReporter(reporter))
192+
runner, err := orchestrator.NewRunner(cfg, engine, healthRunner, locker,
193+
orchestrator.WithReporter(reporter),
194+
orchestrator.WithCommandEnvironment(baseEnv),
195+
)
194196
if err != nil {
195197
fmt.Fprintf(stderr, "failed to initialise orchestrator: %v\n", err)
196198
return exitRunError
@@ -293,7 +295,7 @@ func commandStatusWithWriters(args []string, stdout, stderr io.Writer) int {
293295
return exitConfigError
294296
}
295297

296-
baseEnv := buildBaseEnvironment(&cfgCopy)
298+
baseEnv := cfgCopy.BaseEnvironment()
297299
if *skipHealth {
298300
baseEnv["RC_SKIP_HEALTH"] = "true"
299301
}
@@ -349,7 +351,7 @@ func commandStatusWithWriters(args []string, stdout, stderr io.Writer) int {
349351
defer etcdManager.Close()
350352
}
351353

352-
runnerOptions := []orchestrator.Option{orchestrator.WithMaxLockAttempts(1)}
354+
runnerOptions := []orchestrator.Option{orchestrator.WithMaxLockAttempts(1), orchestrator.WithCommandEnvironment(baseEnv)}
353355
if *skipLock {
354356
runnerOptions = append(runnerOptions, orchestrator.WithLockAcquisition(false, "lock acquisition skipped (--skip-lock)"))
355357
}
@@ -378,39 +380,6 @@ func commandStatusWithWriters(args []string, stdout, stderr io.Writer) int {
378380
return exitCodeForOutcome(outcome)
379381
}
380382

381-
func buildBaseEnvironment(cfg *config.Config) map[string]string {
382-
env := map[string]string{
383-
"RC_NODE_NAME": cfg.NodeName,
384-
"RC_DRY_RUN": strconv.FormatBool(cfg.DryRun),
385-
}
386-
if cfg.LockKey != "" {
387-
env["RC_LOCK_KEY"] = cfg.LockKey
388-
}
389-
if len(cfg.EtcdEndpoints) > 0 {
390-
env["RC_ETCD_ENDPOINTS"] = strings.Join(cfg.EtcdEndpoints, ",")
391-
}
392-
if cfg.KillSwitchFile != "" {
393-
env["RC_KILL_SWITCH_FILE"] = cfg.KillSwitchFile
394-
}
395-
if cfg.ClusterPolicies.MinHealthyFraction != nil {
396-
env["RC_CLUSTER_MIN_HEALTHY_FRACTION"] = strconv.FormatFloat(*cfg.ClusterPolicies.MinHealthyFraction, 'f', -1, 64)
397-
}
398-
if cfg.ClusterPolicies.MinHealthyAbsolute != nil {
399-
env["RC_CLUSTER_MIN_HEALTHY_ABSOLUTE"] = strconv.Itoa(*cfg.ClusterPolicies.MinHealthyAbsolute)
400-
}
401-
env["RC_CLUSTER_FORBID_IF_ONLY_FALLBACK_LEFT"] = strconv.FormatBool(cfg.ClusterPolicies.ForbidIfOnlyFallbackLeft)
402-
if len(cfg.ClusterPolicies.FallbackNodes) > 0 {
403-
env["RC_CLUSTER_FALLBACK_NODES"] = strings.Join(cfg.ClusterPolicies.FallbackNodes, ",")
404-
}
405-
if len(cfg.Windows.Allow) > 0 {
406-
env["RC_WINDOWS_ALLOW"] = strings.Join(cfg.Windows.Allow, ",")
407-
}
408-
if len(cfg.Windows.Deny) > 0 {
409-
env["RC_WINDOWS_DENY"] = strings.Join(cfg.Windows.Deny, ",")
410-
}
411-
return env
412-
}
413-
414383
func commandValidate(args []string) int {
415384
return commandValidateWithWriters(args, os.Stdout, os.Stderr)
416385
}

cmd/clusterrebootd/main_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -732,7 +732,7 @@ func TestBuildBaseEnvironmentIncludesPolicyContext(t *testing.T) {
732732
},
733733
}
734734

735-
env := buildBaseEnvironment(cfg)
735+
env := cfg.BaseEnvironment()
736736

737737
if got := env["RC_NODE_NAME"]; got != cfg.NodeName {
738738
t.Fatalf("expected RC_NODE_NAME %q, got %q", cfg.NodeName, got)
@@ -766,7 +766,7 @@ func TestBuildBaseEnvironmentIncludesPolicyContext(t *testing.T) {
766766
func TestBuildBaseEnvironmentOmitsUnsetPolicyContext(t *testing.T) {
767767
cfg := &config.Config{}
768768

769-
env := buildBaseEnvironment(cfg)
769+
env := cfg.BaseEnvironment()
770770

771771
if _, ok := env["RC_CLUSTER_MIN_HEALTHY_FRACTION"]; ok {
772772
t.Fatalf("expected RC_CLUSTER_MIN_HEALTHY_FRACTION to be absent")

docs/OPERATIONS.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ it for your environment, and run the daemon with `clusterrebootd run
5252
2. **Define the health script** – Point `health_script` at an absolute path and
5353
configure `health_timeout_sec` so the runner can cancel long-running checks.
5454
The orchestration loop executes the script before and after lock acquisition,
55-
injecting context that indicates the current phase and lock state.【F:examples/config.yaml†L39-L55】【F:cmd/clusterrebootd/main.go†L384-L409
55+
injecting context that indicates the current phase and lock state.【F:examples/config.yaml†L39-L55】【F:pkg/orchestrator/runner.go†L485-L500
5656
3. **Configure the distributed lock** – Supply at least one etcd endpoint,
5757
namespace, and `lock_key`; ensure `lock_ttl_sec` exceeds the health timeout so
5858
the lease outlives the slowest permissible health check. Enable mutual TLS by
@@ -87,13 +87,13 @@ Health scripts are the final safeguard before a reboot. Follow these practices:
8787
- **Use the injected environment** – The coordinator exports static context such
8888
as `RC_NODE_NAME`, `RC_DRY_RUN`, `RC_LOCK_KEY`, etcd endpoints, kill switch
8989
location, cluster policy thresholds, fallback node list, and maintenance
90-
windows so scripts do not need to re-read the YAML file.【F:cmd/clusterrebootd/main.go†L381-L409
90+
windows so scripts do not need to re-read the YAML file.【F:pkg/config/config.go†L230-L263
9191
- **React to runtime hints** – Each invocation adds `RC_PHASE` (`pre-lock` or
9292
`post-lock`), `RC_LOCK_ENABLED`, `RC_LOCK_HELD`, and `RC_LOCK_ATTEMPTS` so
9393
scripts can distinguish dry runs, skipped locks, and contention scenarios.
9494
Diagnostics invoked with `status --skip-health` or `--skip-lock` set
9595
`RC_SKIP_HEALTH`/`RC_SKIP_LOCK` to `true`, allowing scripts to short-circuit
96-
optional checks when operators intentionally bypass them.【F:cmd/clusterrebootd/main.go†L293-L322】【F:pkg/orchestrator/runner.go†L430-L466
96+
optional checks when operators intentionally bypass them.【F:cmd/clusterrebootd/main.go†L298-L305】【F:pkg/orchestrator/runner.go†L485-L500
9797
- **Return meaningful exit codes** – Exit `0` to allow the reboot, non-zero to
9898
block it. Write concise status details to stdout/stderr; they are captured in
9999
the JSON logs and CLI output for incident response.【F:cmd/clusterrebootd/main.go†L482-L517】
@@ -164,7 +164,7 @@ backoff, aligning with the orchestrator's internal retry logic.【F:docs/PACKAGI
164164
| CLI exits with code `2` and `invalid configuration` | Schema or semantic error (e.g. missing node name, TTL too small) | Run `validate-config` and fix the listed problems; the loader aggregates all validation failures to minimise iterations.【F:pkg/config/config.go†L90-L171】 |
165165
| `status` reports `health_blocked` with non-zero exit codes | Health script failed or timed out | Review stdout/stderr in the command output, inspect the script logs, and adjust cluster policy checks or timeouts as needed.【F:cmd/clusterrebootd/main.go†L482-L517】 |
166166
| `status` reports `lock_unavailable` | etcd unreachable or contended | Confirm network reachability, validate TLS credentials, and inspect the lock key metadata (node, PID, timestamp) to identify the current holder before retrying.【F:cmd/clusterrebootd/main.go†L193-L252】【F:pkg/orchestrator/runner.go†L133-L211】 |
167-
| Orchestration skipped with `window_denied`/`window_outside_allow` | Current time falls inside a deny window or outside all allow windows | Adjust the `windows` expressions or wait for the next permitted slot; the decision is also exported to the health script via maintenance window environment variables.【F:pkg/windows/windows.go†L29-L123】【F:cmd/clusterrebootd/main.go†L381-L409|
167+
| Orchestration skipped with `window_denied`/`window_outside_allow` | Current time falls inside a deny window or outside all allow windows | Adjust the `windows` expressions or wait for the next permitted slot; the decision is also exported to the health script via maintenance window environment variables.【F:pkg/windows/windows.go†L29-L123】【F:pkg/config/config.go†L230-L263|
168168
| Metrics server fails to start | Address already in use or invalid listen string | Update `metrics.listen` to a free address/port combination and restart the daemon; the listener prints an error during startup when binding fails.【F:cmd/clusterrebootd/main.go†L193-L252】 |
169169

170170
## Additional References

docs/STATE.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@
3535
- The health script base environment now includes cluster policy thresholds,
3636
fallback node lists, and configured maintenance windows so gating logic can
3737
enforce operator intent without re-reading the configuration file.
38+
- Reboot command execution now expands the same environment placeholders (e.g.
39+
`RC_NODE_NAME`) so the logged and invoked command reflects the active node
40+
context without depending on shell-specific substitution.
3841
- Health script executions are annotated with runtime lock context via
3942
`RC_PHASE`, `RC_LOCK_ENABLED`, `RC_LOCK_HELD`, and `RC_LOCK_ATTEMPTS`, giving
4043
scripts enough detail to reason about contention and ensure post-lock checks

pkg/config/config.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"io"
77
"os"
8+
"strconv"
89
"strings"
910
"time"
1011

@@ -226,6 +227,42 @@ func (c *Config) applyDefaults() {
226227
}
227228
}
228229

230+
// BaseEnvironment returns the static environment variables derived from the configuration.
231+
// The resulting map can be extended with runtime annotations (metrics endpoints, skip flags)
232+
// before injecting it into health scripts or reboot command expansion.
233+
func (c *Config) BaseEnvironment() map[string]string {
234+
env := map[string]string{
235+
"RC_NODE_NAME": c.NodeName,
236+
"RC_DRY_RUN": strconv.FormatBool(c.DryRun),
237+
}
238+
if strings.TrimSpace(c.LockKey) != "" {
239+
env["RC_LOCK_KEY"] = c.LockKey
240+
}
241+
if len(c.EtcdEndpoints) > 0 {
242+
env["RC_ETCD_ENDPOINTS"] = strings.Join(c.EtcdEndpoints, ",")
243+
}
244+
if strings.TrimSpace(c.KillSwitchFile) != "" {
245+
env["RC_KILL_SWITCH_FILE"] = c.KillSwitchFile
246+
}
247+
if c.ClusterPolicies.MinHealthyFraction != nil {
248+
env["RC_CLUSTER_MIN_HEALTHY_FRACTION"] = strconv.FormatFloat(*c.ClusterPolicies.MinHealthyFraction, 'f', -1, 64)
249+
}
250+
if c.ClusterPolicies.MinHealthyAbsolute != nil {
251+
env["RC_CLUSTER_MIN_HEALTHY_ABSOLUTE"] = strconv.Itoa(*c.ClusterPolicies.MinHealthyAbsolute)
252+
}
253+
env["RC_CLUSTER_FORBID_IF_ONLY_FALLBACK_LEFT"] = strconv.FormatBool(c.ClusterPolicies.ForbidIfOnlyFallbackLeft)
254+
if len(c.ClusterPolicies.FallbackNodes) > 0 {
255+
env["RC_CLUSTER_FALLBACK_NODES"] = strings.Join(c.ClusterPolicies.FallbackNodes, ",")
256+
}
257+
if len(c.Windows.Allow) > 0 {
258+
env["RC_WINDOWS_ALLOW"] = strings.Join(c.Windows.Allow, ",")
259+
}
260+
if len(c.Windows.Deny) > 0 {
261+
env["RC_WINDOWS_DENY"] = strings.Join(c.Windows.Deny, ",")
262+
}
263+
return env
264+
}
265+
229266
func (d *DetectorConfig) applyDefaults(index int) {
230267
if strings.TrimSpace(d.Name) != "" {
231268
return

pkg/orchestrator/runner.go

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ type Runner struct {
7272
lockEnabled bool
7373
lockSkipReason string
7474
now func() time.Time
75+
commandEnv map[string]string
7576
}
7677

7778
// Option configures a Runner.
@@ -130,6 +131,14 @@ func WithLockAcquisition(enabled bool, reason string) Option {
130131
}
131132
}
132133

134+
// WithCommandEnvironment overrides the environment used to expand reboot command arguments.
135+
// The provided map is copied to avoid accidental mutations by callers.
136+
func WithCommandEnvironment(env map[string]string) Option {
137+
return func(r *Runner) {
138+
r.commandEnv = cloneEnv(env)
139+
}
140+
}
141+
133142
// WithTimeSource injects a custom time source, enabling deterministic tests.
134143
func WithTimeSource(fn func() time.Time) Option {
135144
return func(r *Runner) {
@@ -166,6 +175,7 @@ func NewRunner(cfg *config.Config, detectors DetectorEvaluator, healthRunner Hea
166175
maxLockTries: 5,
167176
reporter: NoopReporter{},
168177
lockEnabled: true,
178+
commandEnv: cfg.BaseEnvironment(),
169179
}
170180

171181
for _, opt := range opts {
@@ -190,6 +200,9 @@ func NewRunner(cfg *config.Config, detectors DetectorEvaluator, healthRunner Hea
190200
if runner.now == nil {
191201
runner.now = time.Now
192202
}
203+
if runner.commandEnv == nil {
204+
runner.commandEnv = cfg.BaseEnvironment()
205+
}
193206

194207
windowsEval, err := windows.NewEvaluator(cfg.Windows.Allow, cfg.Windows.Deny)
195208
if err != nil {
@@ -268,9 +281,7 @@ func (r *Runner) RunOnce(ctx context.Context) (out Outcome, err error) {
268281
}
269282
out.Message = msg
270283
out.DryRun = r.cfg.DryRun
271-
if len(r.cfg.RebootCommand) > 0 {
272-
out.Command = append([]string(nil), r.cfg.RebootCommand...)
273-
}
284+
out.Command = r.expandCommand(r.cfg.RebootCommand)
274285
return out, nil
275286
}
276287

@@ -323,11 +334,47 @@ func (r *Runner) RunOnce(ctx context.Context) (out Outcome, err error) {
323334
out.Status = OutcomeReady
324335
out.Message = "reboot prerequisites satisfied"
325336
out.DryRun = r.cfg.DryRun
326-
out.Command = append([]string(nil), r.cfg.RebootCommand...)
337+
out.Command = r.expandCommand(r.cfg.RebootCommand)
327338

328339
return out, nil
329340
}
330341

342+
func cloneEnv(src map[string]string) map[string]string {
343+
if len(src) == 0 {
344+
return nil
345+
}
346+
dst := make(map[string]string, len(src))
347+
for k, v := range src {
348+
dst[k] = v
349+
}
350+
return dst
351+
}
352+
353+
func (r *Runner) expandCommand(command []string) []string {
354+
if len(command) == 0 {
355+
return nil
356+
}
357+
expanded := make([]string, len(command))
358+
for i, arg := range command {
359+
expanded[i] = expandWithEnv(arg, r.commandEnv)
360+
}
361+
return expanded
362+
}
363+
364+
func expandWithEnv(input string, env map[string]string) string {
365+
if !strings.Contains(input, "$") {
366+
return input
367+
}
368+
return os.Expand(input, func(key string) string {
369+
if env != nil {
370+
if val, ok := env[key]; ok {
371+
return val
372+
}
373+
}
374+
return os.Getenv(key)
375+
})
376+
}
377+
331378
func (r *Runner) acquireLock(ctx context.Context) (lock.Lease, bool, int, error) {
332379
minBackoff, maxBackoff := r.cfg.BackoffBounds()
333380
if minBackoff <= 0 {

pkg/orchestrator/runner_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,38 @@ func TestRunnerReadyDryRun(t *testing.T) {
548548
}
549549
}
550550

551+
func TestRunnerExpandsRebootCommandPlaceholders(t *testing.T) {
552+
cfg := baseConfig()
553+
cfg.RebootCommand = []string{"/sbin/shutdown", "-r", "now", "coordinated reboot for ${RC_NODE_NAME}"}
554+
lease := &fakeLease{}
555+
engine := &fakeEngine{steps: []evalStep{{requires: true}, {requires: true}}}
556+
healthRunner := &fakeHealth{steps: []healthStep{{result: health.Result{ExitCode: 0}}, {result: health.Result{ExitCode: 0}}}}
557+
locker := &fakeLocker{outcomes: []acquireOutcome{{lease: lease}}}
558+
559+
runner, err := NewRunner(cfg, engine, healthRunner, locker)
560+
if err != nil {
561+
t.Fatalf("failed to create runner: %v", err)
562+
}
563+
564+
outcome, err := runner.RunOnce(context.Background())
565+
if err != nil {
566+
t.Fatalf("unexpected error: %v", err)
567+
}
568+
if outcome.Status != OutcomeReady {
569+
t.Fatalf("expected OutcomeReady, got %s", outcome.Status)
570+
}
571+
if len(outcome.Command) != len(cfg.RebootCommand) {
572+
t.Fatalf("expected command length %d, got %d", len(cfg.RebootCommand), len(outcome.Command))
573+
}
574+
want := "coordinated reboot for " + cfg.NodeName
575+
if got := outcome.Command[len(outcome.Command)-1]; got != want {
576+
t.Fatalf("expected expanded command argument %q, got %q", want, got)
577+
}
578+
if cfg.RebootCommand[len(cfg.RebootCommand)-1] != "coordinated reboot for ${RC_NODE_NAME}" {
579+
t.Fatalf("expected original reboot command to remain unchanged, got %q", cfg.RebootCommand[len(cfg.RebootCommand)-1])
580+
}
581+
}
582+
551583
func TestRunnerHealthError(t *testing.T) {
552584
cfg := baseConfig()
553585
engine := &fakeEngine{steps: []evalStep{{requires: true}}}

0 commit comments

Comments
 (0)