Skip to content

Commit 3e13101

Browse files
- flattened command structure to reduce nesting
- created two separate commands: network-partition & network-latency
- introduced verbose logging flag
1 parent 85ccefe commit 3e13101

File tree

2 files changed

+143
-216
lines changed

2 files changed

+143
-216
lines changed

pkg/cmd/roachprod/cli/chaos.go

Lines changed: 84 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package cli
88
import (
99
"context"
1010
"fmt"
11+
"io"
1112
"os"
1213
"os/signal"
1314
"strings"
@@ -57,13 +58,19 @@ var (
5758
chaosCertsDir string
5859
chaosReplicationFactor int
5960
chaosStage string
61+
verbose bool
62+
63+
// chaosLogger is the logger used by the failure-injection library.
64+
// It is initialized in the chaos command's PersistentPreRunE based on the verbose flag.
65+
chaosLogger *logger.Logger
6066
)
6167

6268
// GlobalChaosOpts captures global chaos flags
6369
type GlobalChaosOpts struct {
6470
WaitBeforeCleanup time.Duration
6571
RunForever bool
6672
Stage FailureStage
73+
Verbose bool
6774
}
6875

6976
// buildChaosCmd creates the root chaos command
@@ -79,6 +86,10 @@ own lifecycle: Setup → Inject → Wait → Recover → Cleanup.
7986
8087
Global flags control the duration and cleanup behavior of all chaos commands.
8188
`,
89+
PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
90+
// Initialize the chaos logger based on verbose flag
91+
return initChaosLogger()
92+
},
8293
}
8394

8495
// Add global flags
@@ -103,9 +114,13 @@ Global flags control the duration and cleanup behavior of all chaos commands.
103114
- recover: runs only the recover phase (removes the failure)
104115
- cleanup: runs only the cleanup phase (removes failure dependencies)
105116
Default: all`)
117+
chaosCmd.PersistentFlags().BoolVar(&verbose,
118+
"verbose", false,
119+
"if set, prints verbose logs from failure-injection library")
106120

107121
// Add subcommands
108-
chaosCmd.AddCommand(cr.buildChaosNetworkCmd())
122+
chaosCmd.AddCommand(cr.buildChaosNetworkPartitionCmd())
123+
chaosCmd.AddCommand(cr.buildChaosNetworkLatencyCmd())
109124

110125
return chaosCmd
111126
}
@@ -148,11 +163,28 @@ func getClusterOptions() []failures.ClusterOptionFunc {
148163
return opts
149164
}
150165

166+
// initChaosLogger initializes the global chaos logger based on the verbose flag.
167+
// This should be called once before any chaos command executes.
168+
func initChaosLogger() error {
169+
cfg := logger.Config{
170+
Stdout: io.Discard,
171+
Stderr: os.Stderr,
172+
}
173+
if verbose {
174+
cfg.Stdout = os.Stdout
175+
}
176+
177+
l, err := cfg.NewLogger("")
178+
if err != nil {
179+
return errors.Wrap(err, "failed to create chaos logger")
180+
}
181+
182+
chaosLogger = l
183+
return nil
184+
}
185+
151186
// parseInt32SliceToNodes converts an int32 slice to install.Nodes
152187
func parseInt32SliceToNodes(nodes []int32) install.Nodes {
153-
if len(nodes) == 0 {
154-
return nil
155-
}
156188
result := make(install.Nodes, len(nodes))
157189
for i, n := range nodes {
158190
result[i] = install.Node(n)
@@ -165,14 +197,18 @@ func parseInt32SliceToNodes(nodes []int32) install.Nodes {
165197
// For individual stages, state validation is disabled (disableStateValidation=true) to allow
166198
// running stages independently without enforcing the complete lifecycle order.
167199
func createFailer(
168-
clusterName string, failureName string, stage FailureStage, opts ...failures.ClusterOptionFunc,
200+
clusterName string,
201+
failureName string,
202+
chaosOpts GlobalChaosOpts,
203+
opts ...failures.ClusterOptionFunc,
169204
) (*failures.Failer, error) {
170205
registry := failures.GetFailureRegistry()
171-
disableStateValidation := stage != StageAll
206+
disableStateValidation := chaosOpts.Stage != StageAll
207+
172208
return registry.GetFailer(
173209
clusterName,
174210
failureName,
175-
config.Logger,
211+
chaosLogger,
176212
disableStateValidation,
177213
opts...,
178214
)
@@ -182,148 +218,140 @@ func createFailer(
182218
// If stage is StageAll, runs the complete lifecycle: Setup → Inject → Wait → Recover → Cleanup.
183219
// Otherwise, runs only the specified individual stage.
184220
func runFailureLifecycle(
185-
ctx context.Context,
186-
l *logger.Logger,
187-
failer *failures.Failer,
188-
args failures.FailureArgs,
189-
opts GlobalChaosOpts,
221+
ctx context.Context, failer *failures.Failer, args failures.FailureArgs, opts GlobalChaosOpts,
190222
) error {
191223
switch opts.Stage {
192224
case StageSetup:
193-
return runSetupStage(ctx, l, failer, args)
225+
return runSetupStage(ctx, failer, args)
194226
case StageInject:
195-
return runInjectStage(ctx, l, failer, args)
227+
return runInjectStage(ctx, failer, args)
196228
case StageRecover:
197-
return runRecoverStage(ctx, l, failer, args)
229+
return runRecoverStage(ctx, failer, args)
198230
case StageCleanup:
199-
return runCleanupStage(ctx, l, failer, args)
231+
return runCleanupStage(ctx, failer, args)
200232
case StageAll:
201-
return runFullLifecycle(ctx, l, failer, args, opts)
233+
return runFullLifecycle(ctx, failer, args, opts)
202234
default:
203235
return errors.Newf("unknown stage: %s", opts.Stage)
204236
}
205237
}
206238

207239
// runSetupStage runs only the setup phase
208-
func runSetupStage(
209-
ctx context.Context, l *logger.Logger, failer *failures.Failer, args failures.FailureArgs,
210-
) error {
211-
l.Printf("Running setup stage...")
212-
if err := failer.Setup(ctx, l, args); err != nil {
240+
func runSetupStage(ctx context.Context, failer *failures.Failer, args failures.FailureArgs) error {
241+
config.Logger.Printf("Running setup stage...")
242+
if err := failer.Setup(ctx, chaosLogger, args); err != nil {
213243
return errors.Wrap(err, "failed to setup failure")
214244
}
215245

216-
l.Printf("Setup stage completed successfully")
246+
config.Logger.Printf("Setup stage completed successfully")
217247
return nil
218248
}
219249

220250
// runInjectStage runs only the inject phase
221-
func runInjectStage(
222-
ctx context.Context, l *logger.Logger, failer *failures.Failer, args failures.FailureArgs,
223-
) error {
224-
l.Printf("Running inject stage...")
225-
if err := failer.Inject(ctx, l, args); err != nil {
251+
func runInjectStage(ctx context.Context, failer *failures.Failer, args failures.FailureArgs) error {
252+
config.Logger.Printf("Running inject stage...")
253+
if err := failer.Inject(ctx, chaosLogger, args); err != nil {
226254
return errors.Wrap(err, "failed to inject failure")
227255
}
228-
l.Printf("Inject stage completed successfully")
256+
config.Logger.Printf("waiting for failure to propagate")
257+
if err := failer.WaitForFailureToPropagate(ctx, chaosLogger); err != nil {
258+
return errors.Wrap(err, "failed to propagate failure")
259+
}
260+
config.Logger.Printf("Inject stage completed successfully")
229261
return nil
230262
}
231263

232264
// runRecoverStage runs only the recover phase.
233265
// When running recover individually with state validation disabled, we use SetInjectArgs
234266
// to provide the necessary context for recovery without actually running the inject phase.
235267
func runRecoverStage(
236-
ctx context.Context, l *logger.Logger, failer *failures.Failer, args failures.FailureArgs,
268+
ctx context.Context, failer *failures.Failer, args failures.FailureArgs,
237269
) error {
238-
l.Printf("Running recover stage...")
270+
config.Logger.Printf("Running recover stage...")
239271
// Set the inject args directly so Recover() has the necessary context
240272
failer.SetInjectArgs(args)
241-
if err := failer.Recover(ctx, l); err != nil {
273+
if err := failer.Recover(ctx, chaosLogger); err != nil {
242274
return errors.Wrap(err, "failed to recover from failure")
243275
}
244276

245-
if err := failer.WaitForFailureToRecover(ctx, l); err != nil {
277+
if err := failer.WaitForFailureToRecover(ctx, chaosLogger); err != nil {
246278
return errors.Wrap(err, "failed to wait for failure to recover")
247279
}
248280

249-
l.Printf("Recover stage completed successfully")
281+
config.Logger.Printf("Recover stage completed successfully")
250282
return nil
251283
}
252284

253285
// runCleanupStage runs only the cleanup phase.
254286
// When running cleanup individually with state validation disabled, we use SetSetupArgs
255287
// to provide the necessary context for cleanup without actually running the setup phase.
256288
func runCleanupStage(
257-
ctx context.Context, l *logger.Logger, failer *failures.Failer, args failures.FailureArgs,
289+
ctx context.Context, failer *failures.Failer, args failures.FailureArgs,
258290
) error {
259-
l.Printf("Running cleanup stage...")
291+
config.Logger.Printf("Running cleanup stage...")
260292

261293
// Set the setup args directly so Cleanup() has the necessary context
262294
failer.SetSetupArgs(args)
263-
if err := failer.Cleanup(ctx, l); err != nil {
295+
if err := failer.Cleanup(ctx, chaosLogger); err != nil {
264296
return errors.Wrap(err, "failed to cleanup failure")
265297
}
266-
l.Printf("Cleanup stage completed successfully")
298+
config.Logger.Printf("Cleanup stage completed successfully")
267299
return nil
268300
}
269301

270302
// runFullLifecycle executes the complete failure lifecycle:
271303
// Setup → Inject → Wait → Recover → Cleanup
272304
func runFullLifecycle(
273-
ctx context.Context,
274-
l *logger.Logger,
275-
failer *failures.Failer,
276-
args failures.FailureArgs,
277-
opts GlobalChaosOpts,
305+
ctx context.Context, failer *failures.Failer, args failures.FailureArgs, opts GlobalChaosOpts,
278306
) error {
279307
// Ensure cleanup always runs, even if we panic or get interrupted
280308
cleanupDone := false
281309
defer func() {
282310
if !cleanupDone {
283-
l.Printf("Running cleanup due to early exit...")
284-
if err := failer.Cleanup(ctx, l); err != nil {
285-
l.Errorf("Cleanup failed: %v", err)
311+
config.Logger.Printf("Running cleanup due to early exit...")
312+
if err := failer.Cleanup(ctx, chaosLogger); err != nil {
313+
config.Logger.Errorf("Cleanup failed: %v", err)
286314
}
287315
}
288316
}()
289317

290318
// Setup phase
291-
if err := runSetupStage(ctx, l, failer, args); err != nil {
319+
if err := runSetupStage(ctx, failer, args); err != nil {
292320
return err
293321
}
294322

295323
// Inject phase
296-
if err := runInjectStage(ctx, l, failer, args); err != nil {
324+
if err := runInjectStage(ctx, failer, args); err != nil {
297325
return err
298326
}
299327

300328
// Wait phase
301329
if opts.RunForever {
302-
l.Printf("Failure injected. Waiting for interrupt (Ctrl+C)...")
330+
config.Logger.Printf("Failure injected. Waiting for interrupt (Ctrl+C)...")
303331
<-waitForInterrupt()
304-
l.Printf("Interrupt received. Beginning recovery...")
332+
config.Logger.Printf("Interrupt received. Beginning recovery...")
305333
} else {
306-
l.Printf("Failure injected. Waiting %s before recovery...", opts.WaitBeforeCleanup)
334+
config.Logger.Printf("Failure injected. Waiting %s before recovery...", opts.WaitBeforeCleanup)
307335
select {
308336
case <-time.After(opts.WaitBeforeCleanup):
309-
l.Printf("Wait period complete. Beginning recovery...")
337+
config.Logger.Printf("Wait period complete. Beginning recovery...")
310338
case <-waitForInterrupt():
311-
l.Printf("Interrupt received. Beginning recovery...")
339+
config.Logger.Printf("Interrupt received. Beginning recovery...")
312340
}
313341
}
314342

315343
// Recover phase
316-
if err := runRecoverStage(ctx, l, failer, args); err != nil {
344+
if err := runRecoverStage(ctx, failer, args); err != nil {
317345
return err
318346
}
319347

320348
// Cleanup phase
321-
if err := runCleanupStage(ctx, l, failer, args); err != nil {
349+
if err := runCleanupStage(ctx, failer, args); err != nil {
322350
return err
323351
}
324352

325353
cleanupDone = true
326-
l.Printf("Failure lifecycle completed successfully")
354+
config.Logger.Printf("Failure lifecycle completed successfully")
327355
return nil
328356
}
329357

0 commit comments

Comments
 (0)