diff --git a/cmd/dryrun.go b/cmd/dryrun.go new file mode 100644 index 00000000..16fedd10 --- /dev/null +++ b/cmd/dryrun.go @@ -0,0 +1,135 @@ +package cmd + +import ( + "fmt" + "strings" + + "github.com/fatih/color" + "github.com/krkn-chaos/krknctl/pkg/provider/models" + "github.com/krkn-chaos/krknctl/pkg/typing" +) + +const dryRunClientValue = "client" + +// DryRunResult holds the outcome of a local validation pass. +type DryRunResult struct { + Valid bool + Messages []dryRunMessage +} + +type dryRunMessage struct { + ok bool + text string +} + +func (r *DryRunResult) addOK(msg string) { + r.Messages = append(r.Messages, dryRunMessage{ok: true, text: msg}) +} + +func (r *DryRunResult) addFail(msg string) { + r.Valid = false + r.Messages = append(r.Messages, dryRunMessage{ok: false, text: msg}) +} + +// Print writes the validation results to stdout using the project colour scheme. +func (r *DryRunResult) Print() { + green := color.New(color.FgGreen).SprintFunc() + red := color.New(color.FgHiRed).SprintFunc() + for _, m := range r.Messages { + if m.ok { + fmt.Printf("%s %s\n", green("✔"), m.text) + } else { + fmt.Printf("%s %s\n", red("✖"), m.text) + } + } +} + +// parseDryRunFlag extracts the --dry-run flag value from raw args. +// Returns ("", false, nil) when the flag is absent. +// Returns an error when the flag is present but has an unsupported value. +func parseDryRunFlag(args []string) (string, bool, error) { + value, found, err := ParseArgValue(args, "--dry-run") + if err != nil { + return "", false, fmt.Errorf("--dry-run: %w", err) + } + if !found { + return "", false, nil + } + if strings.ToLower(value) != dryRunClientValue { + return "", false, fmt.Errorf("--dry-run only accepts %q, got %q", dryRunClientValue, value) + } + return value, true, nil +} + +// validateScenarioLocally performs client-side validation of a scenario's +// input fields against the values supplied in args. +// It never contacts the cluster, the container runtime, or the image registry. +// +// scenarioDetail and globalDetail may be nil (e.g. when the registry is +// unreachable in dry-run mode); in that case only the args themselves are +// checked for well-formedness. +func validateScenarioLocally( + scenarioDetail *models.ScenarioDetail, + globalDetail *models.ScenarioDetail, + args []string, +) *DryRunResult { + result := &DryRunResult{Valid: true} + + // ── 1. schema present ──────────────────────────────────────────────────── + if scenarioDetail == nil { + result.addFail("scenario schema not found") + return result + } + result.addOK("Scenario schema valid") + + // Build a single flat slice of all fields without mutating the originals. + // Using append on a nil/empty base avoids the capacity-aliasing bug where + // append(scenarioDetail.Fields, ...) could overwrite scenarioDetail.Fields. + var allFields []typing.InputField + allFields = append(allFields, scenarioDetail.Fields...) + if globalDetail != nil { + allFields = append(allFields, globalDetail.Fields...) + } + + // ── 2. required fields present ─────────────────────────────────────────── + missingRequired := false + for _, field := range allFields { + if !field.Required { + continue + } + flagName := fmt.Sprintf("--%s", *field.Name) + _, found, _ := ParseArgValue(args, flagName) + hasDefault := field.Default != nil && *field.Default != "" + if !found && !hasDefault { + result.addFail(fmt.Sprintf("Missing required field: %s", *field.Name)) + missingRequired = true + } + } + if !missingRequired { + result.addOK("All required fields present") + } + + // ── 3. validate each supplied value ────────────────────────────────────── + validationFailed := false + for _, field := range allFields { + flagName := fmt.Sprintf("--%s", *field.Name) + rawValue, found, err := ParseArgValue(args, flagName) + if err != nil { + result.addFail(fmt.Sprintf("Flag parse error for %s: %v", *field.Name, err)) + validationFailed = true + continue + } + if !found { + continue // not supplied — required check already handled above + } + if _, err := field.Validate(&rawValue); err != nil { + result.addFail(fmt.Sprintf("Invalid value for %s (%s): %v", *field.Name, field.Type.String(), err)) + validationFailed = true + } + } + if !validationFailed { + result.addOK("Values validated") + } + + return result +} diff --git a/cmd/dryrun_test.go b/cmd/dryrun_test.go new file mode 100644 index 00000000..9ba0b2cf --- /dev/null +++ b/cmd/dryrun_test.go @@ -0,0 +1,217 @@ +package cmd + +import ( + "strings" + "testing" + + "github.com/krkn-chaos/krknctl/pkg/provider/models" + "github.com/krkn-chaos/krknctl/pkg/typing" + "github.com/stretchr/testify/assert" +) + +// ── helpers ────────────────────────────────────────────────────────────────── + +func strPtr(s string) *string { return &s } + +// makeField builds a typing.InputField directly without JSON round-tripping, +// so Required and Default are never silently reset to zero values. +func makeField(name, variable string, typ typing.Type, required bool, defaultVal *string) typing.InputField { + return typing.InputField{ + Name: strPtr(name), + Variable: strPtr(variable), + Description: strPtr(name + " description"), + Type: typ, + Required: required, + Default: defaultVal, + } +} + +func emptyGlobal() *models.ScenarioDetail { + return &models.ScenarioDetail{} +} + +// ── parseDryRunFlag ─────────────────────────────────────────────────────────── + +func TestParseDryRunFlag_NotPresent(t *testing.T) { + _, found, err := parseDryRunFlag([]string{"my-scenario", "--kubeconfig", "/tmp/kube"}) + assert.Nil(t, err) + assert.False(t, found) +} + +func TestParseDryRunFlag_ValidClient(t *testing.T) { + _, found, err := parseDryRunFlag([]string{"my-scenario", "--dry-run=client"}) + assert.Nil(t, err) + assert.True(t, found) +} + +func TestParseDryRunFlag_ValidClientSpaceSeparated(t *testing.T) { + _, found, err := parseDryRunFlag([]string{"my-scenario", "--dry-run", "client"}) + assert.Nil(t, err) + assert.True(t, found) +} + +func TestParseDryRunFlag_InvalidValue(t *testing.T) { + _, found, err := parseDryRunFlag([]string{"my-scenario", "--dry-run=server"}) + assert.NotNil(t, err) + assert.False(t, found) + assert.Contains(t, err.Error(), "client") +} + +func TestParseDryRunFlag_MissingValue(t *testing.T) { + _, found, err := parseDryRunFlag([]string{"my-scenario", "--dry-run"}) + assert.NotNil(t, err) + assert.False(t, found) +} + +// ── validateScenarioLocally ─────────────────────────────────────────────────── + +func TestValidateScenarioLocally_NilScenario(t *testing.T) { + result := validateScenarioLocally(nil, emptyGlobal(), []string{}) + assert.False(t, result.Valid) + assertHasFail(t, result, "schema not found") +} + +func TestValidateScenarioLocally_ValidConfig(t *testing.T) { + dur := "10" + scenario := &models.ScenarioDetail{ + Fields: []typing.InputField{ + makeField("duration", "DURATION", typing.Number, true, &dur), + }, + } + args := []string{"my-scenario", "--duration=30"} + result := validateScenarioLocally(scenario, emptyGlobal(), args) + assert.True(t, result.Valid) + assertHasOK(t, result, "Scenario schema valid") + assertHasOK(t, result, "All required fields present") + assertHasOK(t, result, "Values validated") +} + +func TestValidateScenarioLocally_MissingRequiredField(t *testing.T) { + scenario := &models.ScenarioDetail{ + Fields: []typing.InputField{ + makeField("namespace", "NAMESPACE", typing.String, true, nil), + }, + } + result := validateScenarioLocally(scenario, emptyGlobal(), []string{"my-scenario"}) + assert.False(t, result.Valid) + assertHasFail(t, result, "namespace") +} + +func TestValidateScenarioLocally_RequiredFieldWithDefault_Valid(t *testing.T) { + def := "default-ns" + scenario := &models.ScenarioDetail{ + Fields: []typing.InputField{ + makeField("namespace", "NAMESPACE", typing.String, true, &def), + }, + } + // no --namespace supplied; default covers the required constraint + result := validateScenarioLocally(scenario, emptyGlobal(), []string{"my-scenario"}) + assert.True(t, result.Valid) +} + +func TestValidateScenarioLocally_InvalidType_Number(t *testing.T) { + scenario := &models.ScenarioDetail{ + Fields: []typing.InputField{ + makeField("duration", "DURATION", typing.Number, false, nil), + }, + } + args := []string{"my-scenario", "--duration=notanumber"} + result := validateScenarioLocally(scenario, emptyGlobal(), args) + assert.False(t, result.Valid) + assertHasFail(t, result, "duration") +} + +func TestValidateScenarioLocally_InvalidType_Boolean(t *testing.T) { + scenario := &models.ScenarioDetail{ + Fields: []typing.InputField{ + makeField("verbose", "VERBOSE", typing.Boolean, false, nil), + }, + } + args := []string{"my-scenario", "--verbose=maybe"} + result := validateScenarioLocally(scenario, emptyGlobal(), args) + assert.False(t, result.Valid) + assertHasFail(t, result, "verbose") +} + +func TestValidateScenarioLocally_GlobalFieldsValidated(t *testing.T) { + scenario := &models.ScenarioDetail{} + global := &models.ScenarioDetail{ + Fields: []typing.InputField{ + makeField("log-level", "LOG_LEVEL", typing.Number, false, nil), + }, + } + args := []string{"my-scenario", "--log-level=bad"} + result := validateScenarioLocally(scenario, global, args) + assert.False(t, result.Valid) + assertHasFail(t, result, "log-level") +} + +func TestValidateScenarioLocally_NilGlobalDetail(t *testing.T) { + scenario := &models.ScenarioDetail{ + Fields: []typing.InputField{ + makeField("duration", "DURATION", typing.Number, false, nil), + }, + } + // nil globalDetail must not panic + result := validateScenarioLocally(scenario, nil, []string{"my-scenario", "--duration=5"}) + assert.True(t, result.Valid) +} + +func TestValidateScenarioLocally_MultipleErrors(t *testing.T) { + scenario := &models.ScenarioDetail{ + Fields: []typing.InputField{ + makeField("namespace", "NAMESPACE", typing.String, true, nil), + makeField("duration", "DURATION", typing.Number, true, nil), + }, + } + // both required, neither supplied + result := validateScenarioLocally(scenario, emptyGlobal(), []string{"my-scenario"}) + assert.False(t, result.Valid) + failCount := 0 + for _, m := range result.Messages { + if !m.ok { + failCount++ + } + } + assert.Equal(t, 2, failCount) +} + +// ── DryRunResult.Print (smoke test — just ensure no panic) ─────────────────── + +func TestDryRunResult_Print_NoFields(t *testing.T) { + r := &DryRunResult{Valid: true} + assert.NotPanics(t, func() { r.Print() }) +} + +func TestDryRunResult_Print_Mixed(t *testing.T) { + r := &DryRunResult{Valid: false} + r.addOK("Scenario schema valid") + r.addFail("Missing required field: namespace") + assert.NotPanics(t, func() { r.Print() }) +} + +// ── assertion helpers ───────────────────────────────────────────────────────── + +func assertHasOK(t *testing.T, r *DryRunResult, substr string) { + t.Helper() + for _, m := range r.Messages { + if m.ok && containsCI(m.text, substr) { + return + } + } + t.Errorf("expected an OK message containing %q, got: %+v", substr, r.Messages) +} + +func assertHasFail(t *testing.T, r *DryRunResult, substr string) { + t.Helper() + for _, m := range r.Messages { + if !m.ok && containsCI(m.text, substr) { + return + } + } + t.Errorf("expected a FAIL message containing %q, got: %+v", substr, r.Messages) +} + +func containsCI(s, sub string) bool { + return strings.Contains(strings.ToLower(s), strings.ToLower(sub)) +} diff --git a/cmd/graph.go b/cmd/graph.go index 2ec9cafc..c1e40d9a 100644 --- a/cmd/graph.go +++ b/cmd/graph.go @@ -195,7 +195,7 @@ func NewGraphRunCommand(factory *providerfactory.ProviderFactory, scenarioOrches commChannel := make(chan *models.GraphCommChannel) go func() { - (*scenarioOrchestrator).RunGraph(nodes, executionPlan, environment, volumes, false, commChannel, registrySettings, nil) + (*scenarioOrchestrator).RunGraph(nodes, executionPlan, environment, volumes, false, commChannel, registrySettings, nil, "") }() for { diff --git a/cmd/random.go b/cmd/random.go index f3d4ad87..ca4c2cb0 100644 --- a/cmd/random.go +++ b/cmd/random.go @@ -6,7 +6,10 @@ import ( "fmt" "log" "os" + "os/exec" + "path/filepath" "strings" + "syscall" "time" "github.com/fatih/color" @@ -15,6 +18,7 @@ import ( providerfactory "github.com/krkn-chaos/krknctl/pkg/provider/factory" providermodels "github.com/krkn-chaos/krknctl/pkg/provider/models" "github.com/krkn-chaos/krknctl/pkg/randomgraph" + "github.com/krkn-chaos/krknctl/pkg/randomstate" "github.com/krkn-chaos/krknctl/pkg/scenarioorchestrator" "github.com/krkn-chaos/krknctl/pkg/scenarioorchestrator/models" "github.com/krkn-chaos/krknctl/pkg/scenarioorchestrator/utils" @@ -22,246 +26,409 @@ import ( "github.com/spf13/cobra" ) +// NewRandomCommand is the parent group command; it takes no positional args. func NewRandomCommand() *cobra.Command { - var command = &cobra.Command{ + return &cobra.Command{ Use: "random", Short: "runs or scaffolds a random chaos run based on a json test plan", Long: `runs or scaffolds a random chaos run based on a json test plan`, - Args: cobra.ExactArgs(1), RunE: func(cmd *cobra.Command, args []string) error { return cmd.Help() }, } - return command } -func NewRandomRunCommand(factory *providerfactory.ProviderFactory, scenarioOrchestrator *scenarioorchestrator.ScenarioOrchestrator, config config.Config) *cobra.Command { - var command = &cobra.Command{ +// runRandomPlan is the shared execution core for both foreground and background +// modes. Called directly when attached, or from the re-spawned child when detached. +func runRandomPlan( + cmd *cobra.Command, + args []string, + factory *providerfactory.ProviderFactory, + scenarioOrchestrator *scenarioorchestrator.ScenarioOrchestrator, + cfg config.Config, +) error { + registrySettings, err := providermodels.NewRegistryV2FromEnv(cfg) + if err != nil { + return err + } + if registrySettings == nil { + registrySettings, err = parsePrivateRepoArgs(cmd, nil) + if err != nil { + return err + } + } + + (*scenarioOrchestrator).PrintContainerRuntime() + if registrySettings != nil { + logPrivateRegistry(registrySettings.RegistryURL) + } + + // ── log directory ──────────────────────────────────────────────────────── + logDir, err := cmd.Flags().GetString("log-dir") + if err != nil { + return err + } + if logDir != "" { + expanded, err := commonutils.ExpandFolder(logDir, nil) + if err != nil { + return err + } + logDir = *expanded + if err = os.MkdirAll(logDir, 0750); err != nil { + return fmt.Errorf("creating log directory %s: %w", logDir, err) + } + } + + // ── standard flags ─────────────────────────────────────────────────────── + volumes := make(map[string]string) + environment := make(map[string]string) + + kubeconfig, err := cmd.Flags().GetString("kubeconfig") + if err != nil { + return err + } + if kubeconfig != "" { + expandedConfig, err := commonutils.ExpandFolder(kubeconfig, nil) + if err != nil { + return err + } + kubeconfig = *expandedConfig + if !CheckFileExists(kubeconfig) { + return fmt.Errorf("file %s does not exist", kubeconfig) + } + } + + alertsProfile, err := cmd.Flags().GetString("alerts-profile") + if err != nil { + return err + } + if alertsProfile != "" { + expandedProfile, err := commonutils.ExpandFolder(alertsProfile, nil) + if err != nil { + return err + } + alertsProfile = *expandedProfile + if !CheckFileExists(alertsProfile) { + return fmt.Errorf("file %s does not exist", alertsProfile) + } + } + + metricsProfile, err := cmd.Flags().GetString("metrics-profile") + if err != nil { + return err + } + if metricsProfile != "" { + expandedProfile, err := commonutils.ExpandFolder(metricsProfile, nil) + if err != nil { + return err + } + metricsProfile = *expandedProfile + if !CheckFileExists(metricsProfile) { + return fmt.Errorf("file %s does not exist", metricsProfile) + } + } + + maxParallel, err := cmd.Flags().GetInt("max-parallel") + if err != nil { + return err + } + numberOfScenarios, err := cmd.Flags().GetInt("number-of-scenarios") + if err != nil { + return err + } + exitOnError, err := cmd.Flags().GetBool("exit-on-error") + if err != nil { + return err + } + randomGraphFile, err := cmd.Flags().GetString("graph-dump") + if err != nil { + return err + } + if randomGraphFile == "" { + randomGraphFile = fmt.Sprintf(cfg.RandomGraphPath, time.Now().Unix()) + } + + kubeconfigPath, err := utils.PrepareKubeconfig(&kubeconfig, cfg) + if err != nil { + return err + } + if kubeconfigPath == nil { + return fmt.Errorf("kubeconfig not found: %s", kubeconfig) + } + volumes[*kubeconfigPath] = cfg.KubeconfigPath + + if metricsProfile != "" { + volumes[metricsProfile] = cfg.MetricsProfilePath + } + if alertsProfile != "" { + volumes[alertsProfile] = cfg.AlertsProfilePath + } + + // ── load plan ──────────────────────────────────────────────────────────── + planFile := args[0] + fileData, err := os.ReadFile(planFile) + if err != nil { + return fmt.Errorf("failed to open scenario file: %s", planFile) + } + nodes := make(map[string]models.ScenarioNode) + if err = json.Unmarshal(fileData, &nodes); err != nil { + return err + } + + dataProvider := GetProvider(registrySettings != nil, factory) + + // ── validate ───────────────────────────────────────────────────────────── + spinner := NewSpinnerWithSuffix(" validating chaos plan...") + nameChannel := make(chan *struct { + name *string + err error + }) + spinner.Start() + go func() { + validateGraphScenarioInput(dataProvider, nodes, nameChannel, registrySettings) + }() + for { + result := <-nameChannel + if result == nil { + break + } + if result.err != nil { + spinner.Stop() + return fmt.Errorf("failed to validate scenario %s: %w", *result.name, result.err) + } + if result.name != nil { + spinner.Suffix = fmt.Sprintf(" validating scenario: %s", *result.name) + } + } + spinner.Stop() + + // ── build execution plan ───────────────────────────────────────────────── + executionPlan := randomgraph.NewRandomGraph(nodes, int64(maxParallel), numberOfScenarios) + if err = DumpRandomGraph(nodes, executionPlan, randomGraphFile, cfg.LabelRootNode); err != nil { + return err + } + if len(executionPlan) == 0 { + _, _ = color.New(color.FgYellow).Println("no scenario to execute; the random graph file appears to be empty (single-node graphs are not supported)") + return nil + } + + table, err := NewGraphTable(executionPlan, cfg) + if err != nil { + return err + } + table.Print() + fmt.Print("\n\n") + + // ── persist state ──────────────────────────────────────────────────────── + state := &randomstate.State{ + PID: os.Getpid(), + ScenarioName: filepath.Base(planFile), // display name, not the full path + PlanFile: planFile, + StartTime: time.Now(), + LogDir: logDir, + } + if err = randomstate.SaveState(state); err != nil { + return fmt.Errorf("saving run state: %w", err) + } + defer func() { _ = randomstate.ClearState() }() + + // ── run ────────────────────────────────────────────────────────────────── + spinner.Suffix = " starting chaos scenarios..." + spinner.Start() + + commChannel := make(chan *models.GraphCommChannel) + go func() { + // logDir is forwarded so CommonRunGraph writes log files to the right place + (*scenarioOrchestrator).RunGraph(nodes, executionPlan, environment, volumes, false, commChannel, registrySettings, nil, logDir) + }() + + for { + c := <-commChannel + if c == nil { + break + } + if c.Err != nil { + spinner.Stop() + var exitErr *utils.ExitError + if errors.As(c.Err, &exitErr) { + if c.ScenarioID != nil && c.ScenarioLogFile != nil { + _, _ = color.New(color.FgHiRed).Println(fmt.Sprintf( + "scenario %s at step %d exited with status %d, log: %s", + *c.ScenarioID, *c.Layer, exitErr.ExitStatus, *c.ScenarioLogFile)) + } + if exitOnError { + _, _ = color.New(color.FgHiRed).Printf("aborting chaos run with exit status %d\n", exitErr.ExitStatus) + os.Exit(exitErr.ExitStatus) + } + spinner.Start() + } else { + // non-exit error (runtime / orchestration failure) — surface immediately + return c.Err + } + } + if c != nil && c.Layer != nil { + spinner.Suffix = fmt.Sprintf(" running step %d: %s", *c.Layer, strings.Join(executionPlan[*c.Layer], ", ")) + } + } + spinner.Stop() + return nil +} + +func NewRandomRunCommand(factory *providerfactory.ProviderFactory, scenarioOrchestrator *scenarioorchestrator.ScenarioOrchestrator, cfg config.Config) *cobra.Command { + return &cobra.Command{ Use: "run", Short: "runs a random chaos run", Long: `runs a random run based on a json test plan`, Args: cobra.ExactArgs(1), RunE: func(cmd *cobra.Command, args []string) error { - registrySettings, err := providermodels.NewRegistryV2FromEnv(config) - if err != nil { - return err - } - if registrySettings == nil { - registrySettings, err = parsePrivateRepoArgs(cmd, nil) - if err != nil { - return err - } - } - (*scenarioOrchestrator).PrintContainerRuntime() - if registrySettings != nil { - logPrivateRegistry(registrySettings.RegistryURL) - } - spinner := NewSpinnerWithSuffix("running randomly generated chaos plan...") - volumes := make(map[string]string) - environment := make(map[string]string) - kubeconfig, err := cmd.Flags().GetString("kubeconfig") - if err != nil { - return err - } - if kubeconfig != "" { - expandedConfig, err := commonutils.ExpandFolder(kubeconfig, nil) - if err != nil { - return err - } - kubeconfig = *expandedConfig - if !CheckFileExists(kubeconfig) { - return fmt.Errorf("file %s does not exist", kubeconfig) - } - } - alertsProfile, err := cmd.Flags().GetString("alerts-profile") - if err != nil { - return err - } - if alertsProfile != "" { - expandedProfile, err := commonutils.ExpandFolder(alertsProfile, nil) - if err != nil { - return err - } - alertsProfile = *expandedProfile - if !CheckFileExists(alertsProfile) { - return fmt.Errorf("file %s does not exist", alertsProfile) - } - } - metricsProfile, err := cmd.Flags().GetString("metrics-profile") + detach, err := cmd.Flags().GetBool("detach") if err != nil { return err } - if metricsProfile != "" { - expandedProfile, err := commonutils.ExpandFolder(metricsProfile, nil) - if err != nil { - return err - } - metricsProfile = *expandedProfile - if !CheckFileExists(metricsProfile) { - return fmt.Errorf("file %s does not exist", metricsProfile) - } - } - maxParallel, err := cmd.Flags().GetInt("max-parallel") - if err != nil { - return err + if detach { + return runDetached() } + return runRandomPlan(cmd, args, factory, scenarioOrchestrator, cfg) + }, + } +} - numberOfScenarios, err := cmd.Flags().GetInt("number-of-scenarios") - if err != nil { - return err - } +// runDetached re-executes the current binary without --detach/-d so the child +// runs the actual plan in the foreground while the parent returns immediately. +// No args parameter needed — everything is already in os.Args. +func runDetached() error { + exe, err := os.Executable() + if err != nil { + return fmt.Errorf("resolving executable path: %w", err) + } - exitOnerror, err := cmd.Flags().GetBool("exit-on-error") - if err != nil { - return err - } + // Forward all original args, stripping --detach / -d in every token form. + var childArgs []string + for _, a := range os.Args[1:] { + if a == "--detach" || a == "-d" || + strings.HasPrefix(a, "--detach=") || strings.HasPrefix(a, "-d=") { + continue + } + childArgs = append(childArgs, a) + } - randomGraphFile, err := cmd.Flags().GetString("graph-dump") - if err != nil { - return err - } + child := exec.Command(exe, childArgs...) // #nosec G204 -- exe comes from os.Executable(), not user input + child.Stdout = nil + child.Stderr = nil + child.Stdin = nil - if randomGraphFile == "" { - randomGraphFile = fmt.Sprintf(config.RandomGraphPath, time.Now().Unix()) - } + if err = child.Start(); err != nil { + return fmt.Errorf("starting background process: %w", err) + } - kubeconfigPath, err := utils.PrepareKubeconfig(&kubeconfig, config) + _, _ = color.New(color.FgGreen).Printf("🚀 chaos plan started in background (PID %d)\n", child.Process.Pid) + return nil +} + +// NewRandomStatusCommand returns the `random status` subcommand. +func NewRandomStatusCommand() *cobra.Command { + return &cobra.Command{ + Use: "status", + Short: "shows the status of a running random chaos plan", + Long: `shows the status of a running random chaos plan`, + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + state, err := randomstate.LoadState() if err != nil { - return err + return fmt.Errorf("reading state: %w", err) } - if kubeconfigPath == nil { - return fmt.Errorf("kubeconfig not found: %s", kubeconfig) + if state == nil { + _, _ = color.New(color.FgYellow).Println("no random chaos plan is currently running") + return nil } - volumes[*kubeconfigPath] = config.KubeconfigPath - if metricsProfile != "" { - volumes[metricsProfile] = config.MetricsProfilePath - } + running := isProcessAlive(state.PID) - if alertsProfile != "" { - volumes[alertsProfile] = config.AlertsProfilePath - } + green := color.New(color.FgGreen).SprintFunc() + red := color.New(color.FgRed).SprintFunc() + bold := color.New(color.Bold).SprintFunc() - file, err := os.ReadFile(args[0]) - if err != nil { - return fmt.Errorf("failed to open scenario file: %s", args[0]) + runningStr := red("false") + if running { + runningStr = green("true") } - nodes := make(map[string]models.ScenarioNode) - err = json.Unmarshal(file, &nodes) - - if err != nil { - return err - } - privateRegistry := false - if registrySettings != nil { - privateRegistry = true - } - dataProvider := GetProvider(privateRegistry, factory) - - nameChannel := make(chan *struct { - name *string - err error - }) - spinner.Start() - go func() { - validateGraphScenarioInput(dataProvider, nodes, nameChannel, registrySettings) - }() - - for { - validateResult := <-nameChannel - if validateResult == nil { - break - } - if validateResult.err != nil { - return fmt.Errorf("failed to validate scenario: %s, error: %s", *validateResult.name, validateResult.err) - } - if validateResult.name != nil { - spinner.Suffix = fmt.Sprintf("validating input for scenario: %s", *validateResult.name) - } + fmt.Printf("%s %s\n", bold("running:"), runningStr) + fmt.Printf("%s %s\n", bold("scenario:"), state.ScenarioName) + fmt.Printf("%s %s\n", bold("plan-file:"), state.PlanFile) + fmt.Printf("%s %d\n", bold("pid:"), state.PID) + fmt.Printf("%s %s\n", bold("started:"), state.StartTime.Format(time.RFC3339)) + if state.LogDir != "" { + fmt.Printf("%s %s\n", bold("log-dir:"), state.LogDir) } - spinner.Stop() + if !running { + _, _ = color.New(color.FgYellow).Println("\nprocess is no longer alive; clearing stale state") + _ = randomstate.ClearState() + } + return nil + }, + } +} - executionPlan := randomgraph.NewRandomGraph(nodes, int64(maxParallel), - numberOfScenarios) - if err = DumpRandomGraph(nodes, executionPlan, randomGraphFile, config.LabelRootNode); err != nil { - return err +// NewRandomAbortCommand returns the `random abort` subcommand. +func NewRandomAbortCommand() *cobra.Command { + return &cobra.Command{ + Use: "abort", + Short: "stops a running random chaos plan", + Long: `stops a running random chaos plan and cleans up its state`, + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + state, err := randomstate.LoadState() + if err != nil { + return fmt.Errorf("reading state: %w", err) } - if len(executionPlan) == 0 { - _, err = color.New(color.FgYellow).Println("No scenario to execute; the random graph file appears to be empty (single-node graphs are not supported).") - if err != nil { - return err - } + if state == nil { + _, _ = color.New(color.FgYellow).Println("no random chaos plan is currently running") return nil } - table, err := NewGraphTable(executionPlan, config) + process, err := os.FindProcess(state.PID) if err != nil { - return err + _ = randomstate.ClearState() + return fmt.Errorf("finding process %d: %w", state.PID, err) } - table.Print() - fmt.Print("\n\n") - spinner.Suffix = "starting chaos scenarios..." - spinner.Start() - - commChannel := make(chan *models.GraphCommChannel) - - go func() { - (*scenarioOrchestrator).RunGraph(nodes, executionPlan, environment, volumes, false, commChannel, registrySettings, nil) - }() - - for { - c := <-commChannel - if c == nil { - break - } else { - if c.Err != nil { - spinner.Stop() - var statErr *utils.ExitError - if errors.As(c.Err, &statErr) { - if c.ScenarioID != nil && c.ScenarioLogFile != nil { - _, err = color.New(color.FgHiRed).Println(fmt.Sprintf("scenario %s at step %d with exit status %d, check log file %s aborting chaos run.", - *c.ScenarioID, - *c.Layer, - statErr.ExitStatus, - *c.ScenarioLogFile)) - if err != nil { - return err - } - } - if exitOnerror { - _, err = color.New(color.FgHiRed).Println(fmt.Sprintf("aborting chaos run with exit status %d", statErr.ExitStatus)) - if err != nil { - return err - } - os.Exit(statErr.ExitStatus) - } - spinner.Start() - } - - } - spinner.Suffix = fmt.Sprintf("Running step %d scenario(s): %s", *c.Layer, strings.Join(executionPlan[*c.Layer], ", ")) - } + if err = process.Kill(); err != nil { + _ = randomstate.ClearState() + return fmt.Errorf("killing process %d: %w", state.PID, err) + } + if err = randomstate.ClearState(); err != nil { + return fmt.Errorf("clearing state: %w", err) } - spinner.Stop() + _, _ = color.New(color.FgGreen).Printf("✅ chaos plan (PID %d, scenario: %s) aborted successfully\n", + state.PID, state.ScenarioName) return nil }, } - return command } -func NewRandomScaffoldCommand(factory *providerfactory.ProviderFactory, config config.Config) *cobra.Command { - var command = &cobra.Command{ +// isProcessAlive probes whether a process is still running using signal 0. +// syscall.Signal(0) checks existence without delivering an actual signal. +// os.Signal(nil) must NOT be used — it is a nil interface, not signal 0. +func isProcessAlive(pid int) bool { + process, err := os.FindProcess(pid) + if err != nil { + return false + } + return process.Signal(syscall.Signal(0)) == nil +} + +func NewRandomScaffoldCommand(factory *providerfactory.ProviderFactory, cfg config.Config) *cobra.Command { + return &cobra.Command{ Use: "scaffold", Short: "scaffolds a random chaos run", Long: `scaffolds a random run based on a json test plan`, Args: cobra.MinimumNArgs(0), ValidArgsFunction: func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - registrySettings, err := providermodels.NewRegistryV2FromEnv(config) + registrySettings, err := providermodels.NewRegistryV2FromEnv(cfg) if err != nil { return nil, cobra.ShellCompDirectiveError } @@ -271,22 +438,17 @@ func NewRandomScaffoldCommand(factory *providerfactory.ProviderFactory, config c return nil, cobra.ShellCompDirectiveError } } - if err != nil { - log.Fatalf("Error fetching scenarios: %v", err) - return nil, cobra.ShellCompDirectiveError - } dataProvider := GetProvider(registrySettings != nil, factory) scenarios, err := FetchScenarios(dataProvider, registrySettings) if err != nil { log.Fatalf("Error fetching scenarios: %v", err) return []string{}, cobra.ShellCompDirectiveError } - return *scenarios, cobra.ShellCompDirectiveNoFileComp }, RunE: func(cmd *cobra.Command, args []string) error { var seed *provider.ScaffoldSeed - registrySettings, err := providermodels.NewRegistryV2FromEnv(config) + registrySettings, err := providermodels.NewRegistryV2FromEnv(cfg) if err != nil { return err } @@ -309,7 +471,6 @@ func NewRandomScaffoldCommand(factory *providerfactory.ProviderFactory, config c if err != nil { return err } - if seedFile != "" { seedFilePath, err := commonutils.ExpandFolder(seedFile, nil) if err != nil { @@ -322,12 +483,9 @@ func NewRandomScaffoldCommand(factory *providerfactory.ProviderFactory, config c NumberOfScenarios: numberOfScenarios, Path: *seedFilePath, } - } else { - if len(args) == 0 { - return fmt.Errorf("please provide at least one scenario") - } + } else if len(args) == 0 { + return fmt.Errorf("please provide at least one scenario") } - output, err := dataProvider.ScaffoldScenarios(args, includeGlobalEnv, registrySettings, true, seed) if err != nil { return err @@ -336,5 +494,4 @@ func NewRandomScaffoldCommand(factory *providerfactory.ProviderFactory, config c return nil }, } - return command } diff --git a/cmd/root.go b/cmd/root.go index e3b4a798..debac3fb 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -63,6 +63,7 @@ func Execute(providerFactory *factory.ProviderFactory, scenarioOrchestrator *sce runCmd.LocalFlags().String("alerts-profile", "", "custom alerts profile file path") runCmd.LocalFlags().String("metrics-profile", "", "custom metrics profile file path") runCmd.LocalFlags().Bool("detached", false, "if set this flag will run in detached mode") + runCmd.LocalFlags().String("dry-run", "", "validate scenario locally without cluster calls; only accepted value: client (e.g. --dry-run=client)") runCmd.DisableFlagParsing = true rootCmd.AddCommand(runCmd) @@ -93,12 +94,19 @@ func Execute(providerFactory *factory.ProviderFactory, scenarioOrchestrator *sce randomRunCmd.Flags().Int("number-of-scenarios", 0, "allows you to specify the number of elements to select from the execution plan") randomRunCmd.Flags().Bool("exit-on-error", false, "if set this flag will the workflow will be interrupted and the tool will exit with a status greater than 0") randomRunCmd.Flags().String("graph-dump", "", "specifies the name of the file where the randomly generated dependency graph will be persisted") + randomRunCmd.Flags().BoolP("detach", "d", false, "run in background and return immediately; default is foreground (attached)") + randomRunCmd.Flags().String("log-dir", "", "directory to store scenario log files (created if it does not exist)") err := randomRunCmd.MarkFlagRequired("max-parallel") if err != nil { fmt.Println("Error marking flag as required:", err) os.Exit(1) } + randomStatusCmd := NewRandomStatusCommand() + randomAbortCmd := NewRandomAbortCommand() + randomCmd.AddCommand(randomStatusCmd) + randomCmd.AddCommand(randomAbortCmd) + randomScaffoldCmd := NewRandomScaffoldCommand(providerFactory, config) randomScaffoldCmd.Flags().Bool("global-env", false, "if set this flag will add global environment variables to each scenario in the graph") randomScaffoldCmd.Flags().String("seed-file", "", "template file with already configured scenarios used to generate the random test plan") diff --git a/cmd/run.go b/cmd/run.go index e529c902..dd1647ac 100644 --- a/cmd/run.go +++ b/cmd/run.go @@ -48,6 +48,13 @@ func NewRunCommand(factory *factory.ProviderFactory, scenarioOrchestrator *scena }, PreRunE: func(cmd *cobra.Command, args []string) error { + // dry-run=client skips all registry/cluster calls + if _, isDryRun, err := parseDryRunFlag(args); err != nil { + return err + } else if isDryRun { + return nil + } + registrySettings, err := models.NewRegistryV2FromEnv(config) if err != nil { return err @@ -122,6 +129,45 @@ func NewRunCommand(factory *factory.ProviderFactory, scenarioOrchestrator *scena return nil }, RunE: func(cmd *cobra.Command, args []string) error { + // ── dry-run=client: local validation only, no cluster/runtime calls ── + if _, isDryRun, err := parseDryRunFlag(args); err != nil { + return err + } else if isDryRun { + scenarioName, err := parseScenarioName(args) + if err != nil { + return err + } + + // Resolve registry settings from env or args (no cluster calls). + registrySettings, _ := models.NewRegistryV2FromEnv(config) + if registrySettings == nil { + registrySettings, _ = parsePrivateRepoArgs(cmd, &args) + } + + // Fetch scenario metadata from the image registry. + // This is the only network call in dry-run mode and it targets + // the image registry (Quay / private), NOT the Kubernetes cluster. + // kubeconfig is never loaded and no cluster API is contacted. + provider := GetProvider(registrySettings != nil, factory) + scenarioDetail, fetchErr := provider.GetScenarioDetail(scenarioName, registrySettings) + if fetchErr != nil { + // Registry unreachable — report clearly and exit non-zero. + return fmt.Errorf("dry-run: could not fetch scenario metadata for %q: %w", scenarioName, fetchErr) + } + + var globalDetail *models.ScenarioDetail + if scenarioDetail != nil { + globalDetail, _ = provider.GetGlobalEnvironment(registrySettings, scenarioName) + } + + result := validateScenarioLocally(scenarioDetail, globalDetail, args) + result.Print() + if !result.Valid { + return fmt.Errorf("dry-run validation failed") + } + return nil + } + registrySettings, err := models.NewRegistryV2FromEnv(config) if err != nil { return err diff --git a/pkg/randomstate/state.go b/pkg/randomstate/state.go new file mode 100644 index 00000000..129166bf --- /dev/null +++ b/pkg/randomstate/state.go @@ -0,0 +1,69 @@ +// Package randomstate manages the lifecycle state of a random chaos run. +// State is persisted as a JSON file so that status/abort commands can +// inspect or stop a running plan across process boundaries. +package randomstate + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "time" +) + +const stateDirName = ".krknctl" +const stateFileName = "random_run.json" + +// State holds the runtime information of an active random chaos run. +type State struct { + PID int `json:"pid"` + ScenarioName string `json:"scenario_name"` // human-readable basename of the plan file + PlanFile string `json:"plan_file"` // full path to the plan file + StartTime time.Time `json:"start_time"` + LogDir string `json:"log_dir,omitempty"` +} + +// statePath returns the OS-appropriate path for the state file, +// using os.TempDir() so it works on Linux, macOS, and Windows. +func statePath() string { + return filepath.Join(os.TempDir(), stateDirName, stateFileName) +} + +// SaveState persists s to disk, creating the state directory if needed. +func SaveState(s *State) error { + dir := filepath.Dir(statePath()) + if err := os.MkdirAll(dir, 0700); err != nil { + return fmt.Errorf("creating state dir: %w", err) + } + data, err := json.MarshalIndent(s, "", " ") + if err != nil { + return fmt.Errorf("marshalling state: %w", err) + } + return os.WriteFile(statePath(), data, 0600) +} + +// LoadState reads the persisted state from disk. +// Returns (nil, nil) when no state file exists — nothing is running. +func LoadState() (*State, error) { + data, err := os.ReadFile(statePath()) // #nosec G304 -- path is constructed from os.TempDir() + fixed constants + if os.IsNotExist(err) { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("reading state file: %w", err) + } + var s State + if err = json.Unmarshal(data, &s); err != nil { + return nil, fmt.Errorf("parsing state file: %w", err) + } + return &s, nil +} + +// ClearState removes the state file from disk. Safe to call when no state exists. +func ClearState() error { + err := os.Remove(statePath()) + if os.IsNotExist(err) { + return nil + } + return err +} diff --git a/pkg/scenarioorchestrator/common_functions.go b/pkg/scenarioorchestrator/common_functions.go index 786d6b5d..be687ab8 100644 --- a/pkg/scenarioorchestrator/common_functions.go +++ b/pkg/scenarioorchestrator/common_functions.go @@ -28,6 +28,7 @@ func CommonRunGraph( config config.Config, registry *providermodels.RegistryV2, userID *int, + logDir string, ) { for step, s := range resolvedGraph { var wg sync.WaitGroup @@ -65,21 +66,27 @@ func CommonRunGraph( } containerName := utils.GenerateContainerName(config, scenario.Name, &scID) - filename := fmt.Sprintf("%s.log", containerName) - file, err := os.Create(path.Clean(filename)) - + logFilename := fmt.Sprintf("%s.log", containerName) + // honour custom log directory when provided + var logFilePath string + if logDir != "" { + logFilePath = path.Join(logDir, logFilename) + } else { + logFilePath = logFilename + } + file, err := os.Create(path.Clean(logFilePath)) if err != nil { commChannel <- &models.GraphCommChannel{Layer: nil, ScenarioID: nil, ScenarioLogFile: nil, Err: err} return } - commChannel <- &models.GraphCommChannel{Layer: &step, ScenarioID: &scID, ScenarioLogFile: &filename, Err: nil} + commChannel <- &models.GraphCommChannel{Layer: &step, ScenarioID: &scID, ScenarioLogFile: &logFilePath, Err: nil} wg.Add(1) go func() { defer wg.Done() _, err = orchestrator.RunAttached(scenario.Image, containerName, env, cache, volumes, file, file, nil, ctx, registry) if err != nil { - commChannel <- &models.GraphCommChannel{Layer: &step, ScenarioID: &scID, ScenarioLogFile: &filename, Err: err} + commChannel <- &models.GraphCommChannel{Layer: &step, ScenarioID: &scID, ScenarioLogFile: &logFilePath, Err: err} return } }() diff --git a/pkg/scenarioorchestrator/docker/scenario_orchestrator.go b/pkg/scenarioorchestrator/docker/scenario_orchestrator.go index f81fbbcf..94c41408 100644 --- a/pkg/scenarioorchestrator/docker/scenario_orchestrator.go +++ b/pkg/scenarioorchestrator/docker/scenario_orchestrator.go @@ -466,8 +466,9 @@ func (c *ScenarioOrchestrator) RunGraph( commChannel chan *orchestratormodels.GraphCommChannel, registry *providermodels.RegistryV2, userID *int, + logDir string, ) { - scenarioorchestrator.CommonRunGraph(scenarios, resolvedGraph, extraEnv, extraVolumeMounts, cache, commChannel, c, c.Config, registry, userID) + scenarioorchestrator.CommonRunGraph(scenarios, resolvedGraph, extraEnv, extraVolumeMounts, cache, commChannel, c, c.Config, registry, userID, logDir) } func (c *ScenarioOrchestrator) PrintContainerRuntime() { diff --git a/pkg/scenarioorchestrator/podman/scenario_orchestrator.go b/pkg/scenarioorchestrator/podman/scenario_orchestrator.go index 9d6bf1b0..61a25278 100644 --- a/pkg/scenarioorchestrator/podman/scenario_orchestrator.go +++ b/pkg/scenarioorchestrator/podman/scenario_orchestrator.go @@ -373,9 +373,9 @@ func (c *ScenarioOrchestrator) RunGraph( commChannel chan *orchestratormodels.GraphCommChannel, registry *providermodels.RegistryV2, userID *int, + logDir string, ) { - //TODO: add a getconfig method in scenarioOrchestrator - scenarioorchestrator.CommonRunGraph(scenarios, resolvedGraph, extraEnv, extraVolumeMounts, cache, commChannel, c, c.Config, registry, userID) + scenarioorchestrator.CommonRunGraph(scenarios, resolvedGraph, extraEnv, extraVolumeMounts, cache, commChannel, c, c.Config, registry, userID, logDir) } func (c *ScenarioOrchestrator) PrintContainerRuntime() { diff --git a/pkg/scenarioorchestrator/scenario_orchestrator.go b/pkg/scenarioorchestrator/scenario_orchestrator.go index 6d9805ae..989d5fb3 100644 --- a/pkg/scenarioorchestrator/scenario_orchestrator.go +++ b/pkg/scenarioorchestrator/scenario_orchestrator.go @@ -46,6 +46,7 @@ type ScenarioOrchestrator interface { commChannel chan *orchestrator_models.GraphCommChannel, registry *models.RegistryV2, userID *int, + logDir string, ) CleanContainers(ctx context.Context) (*int, error) diff --git a/pkg/scenarioorchestrator/scenarioorchestratortest/common_test_functions.go b/pkg/scenarioorchestrator/scenarioorchestratortest/common_test_functions.go index 5787800b..18f52603 100644 --- a/pkg/scenarioorchestrator/scenarioorchestratortest/common_test_functions.go +++ b/pkg/scenarioorchestrator/scenarioorchestratortest/common_test_functions.go @@ -336,7 +336,7 @@ func CommonTestScenarioOrchestratorRunGraph(t *testing.T, so scenarioorchestrato commChannel := make(chan *models.GraphCommChannel) go func() { - so.RunGraph(nodes, executionPlan, map[string]string{}, map[string]string{}, false, commChannel, nil, uid) + so.RunGraph(nodes, executionPlan, map[string]string{}, map[string]string{}, false, commChannel, nil, uid, "") }() for { @@ -412,7 +412,7 @@ func CommonTestScenarioOrchestratorRunGraph(t *testing.T, so scenarioorchestrato commChannel = make(chan *models.GraphCommChannel) go func() { - so.RunGraph(nodes, executionPlan, map[string]string{}, map[string]string{}, false, commChannel, nil, uid) + so.RunGraph(nodes, executionPlan, map[string]string{}, map[string]string{}, false, commChannel, nil, uid, "") }() for {