 	"context"
 	"fmt"
 	"math/rand"
+	"os"
+	"regexp"
 	"strings"
 	"time"
 
@@ -23,6 +25,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
+	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/util/randutil"
 	"github.com/cockroachdb/errors"
 )
@@ -52,6 +55,8 @@ type failureSmokeTest struct {
 	validateRecover func(ctx context.Context, l *logger.Logger, c cluster.Cluster, f *failures.Failer) error
 	// The workload to be run during the failureSmokeTest, if nil, defaultSmokeTestWorkload is used.
 	workload func(ctx context.Context, c cluster.Cluster, args ...string) error
+	// The duration to run the workload for before injecting the failure.
+	workloadRamp time.Duration
 }
 
 func (t *failureSmokeTest) run(
@@ -82,6 +87,15 @@ func (t *failureSmokeTest) run(
 		return err
 	}
 
+	if t.workloadRamp > 0 {
+		l.Printf("sleeping for %s before injecting failure", t.workloadRamp)
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(t.workloadRamp):
+		}
+	}
+
 	quietLogger, file, err = roachtestutil.LoggerForCmd(l, c.CRDBNodes(), t.testName, "inject")
 	if err != nil {
 		return err
@@ -596,6 +610,122 @@ var dmsetupDiskStallTest = func(c cluster.Cluster) failureSmokeTest {
 	}
 }
 
+var processKillTests = func(c cluster.Cluster) []failureSmokeTest {
+	rng, _ := randutil.NewPseudoRand()
+	var tests []failureSmokeTest
+	for _, gracefulShutdown := range []bool{true, false} {
+		groups, _ := c.CRDBNodes().SeededRandGroups(rng, 2 /* numGroups */)
+		killedNodeGroup := groups[0]
+		unaffectedNodeGroup := groups[1]
+
+		// These are the nodes that we will run validation on.
+		killedNode := killedNodeGroup.SeededRandNode(rng)
+		unaffectedNode := unaffectedNodeGroup.SeededRandNode(rng)
+
+		tests = append(tests, failureSmokeTest{
+			testName:    fmt.Sprintf("%s/GracefulShutdown=%t", failures.ProcessKillFailureName, gracefulShutdown),
+			failureName: failures.ProcessKillFailureName,
+			args: failures.ProcessKillArgs{
+				Nodes:            killedNodeGroup.InstallNodes(),
+				GracefulShutdown: gracefulShutdown,
+				GracePeriod:      time.Minute,
+			},
+			validateFailure: func(ctx context.Context, l *logger.Logger, c cluster.Cluster, f *failures.Failer) error {
+				// If we initiate a graceful shutdown, the cockroach process should
+				// intercept it and start draining the node.
+				if gracefulShutdown {
+					err := testutils.SucceedsSoonError(func() error {
+						if ctx.Err() != nil {
+							return ctx.Err()
+						}
+						res, err := c.RunWithDetailsSingleNode(ctx, l, option.WithNodes(unaffectedNode), fmt.Sprintf("./cockroach node status %d --decommission --certs-dir=%s | sed -n '2p' | awk '{print $NF}'", killedNode[0], install.CockroachNodeCertsDir))
+						if err != nil {
+							return err
+						}
+						isDraining := strings.TrimSpace(res.Stdout)
+						if isDraining != "true" {
+							return errors.Errorf("expected node %d to be draining", killedNode[0])
+						}
+						return nil
+					})
+					if err != nil {
+						return err
+					}
+				}
+
+				// Check that we aren't able to establish a SQL connection to the killed node.
+				// waitForFailureToPropagate already checks system death for us, which is a
+				// stronger assertion than checking SQL connections are unavailable. We
+				// are mostly doing this to satisfy the smoke test framework since this is
+				// a fairly simple failure mode with less to validate.
+				err := testutils.SucceedsSoonError(func() error {
+					if ctx.Err() != nil {
+						return ctx.Err()
+					}
+
+					killedDB, err := c.ConnE(ctx, l, killedNode[0])
+					if err == nil {
+						defer killedDB.Close()
+						if err := killedDB.Ping(); err == nil {
+							return errors.Errorf("expected node %d to be dead, but it is alive", killedNode)
+						} else {
+							l.Printf("failed to connect to node %d: %v", killedNode, err)
+						}
+					} else {
+						l.Printf("unable to establish SQL connection to node %d", killedNode)
+					}
+					return nil
+				})
+
+				return err
+			},
+			// Similar to validateFailure, there is not much to validate here that isn't
+			// covered by WaitForFailureToRecover, so just skip it.
+			validateRecover: func(ctx context.Context, l *logger.Logger, c cluster.Cluster, f *failures.Failer) error {
+				return nil
+			},
+			workload: func(ctx context.Context, c cluster.Cluster, args ...string) error {
+				return defaultFailureSmokeTestWorkload(ctx, c, "--tolerate-errors")
+			},
+			// Shutting down the server right after it's started can cause draining to be skipped.
+			workloadRamp: 30 * time.Second,
+		})
+	}
+
+	groups, _ := c.CRDBNodes().SeededRandGroups(rng, 2 /* numGroups */)
+	killedNodeGroup := groups[0]
+	// This is the node that we will run validation on.
+	killedNode := killedNodeGroup.SeededRandNode(rng)
+	noopSignal := 0
+
+	// Test that the GracePeriod logic will kick in if the SIGTERM hangs.
+	tests = append(tests, failureSmokeTest{
+		testName:    fmt.Sprintf("%s/hanging-drain", failures.ProcessKillFailureName),
+		failureName: failures.ProcessKillFailureName,
+		args: failures.ProcessKillArgs{
+			Nodes:       killedNode.InstallNodes(),
+			Signal:      &noopSignal,
+			GracePeriod: 30 * time.Second,
+		},
+		// There isn't anything to validate here because our failure is effectively
+		// a noop at first. Only after the GracePeriod will we see anything happen.
+		// We could block for 30 seconds and then check that the node is dead, but
+		// this is the same thing WaitForFailureToPropagate does for us.
+		validateFailure: func(ctx context.Context, l *logger.Logger, c cluster.Cluster, f *failures.Failer) error {
+			return nil
+		},
+		validateRecover: func(ctx context.Context, l *logger.Logger, c cluster.Cluster, f *failures.Failer) error {
+			return nil
+		},
+		workload: func(ctx context.Context, c cluster.Cluster, args ...string) error {
+			return defaultFailureSmokeTestWorkload(ctx, c, "--tolerate-errors")
+		},
+		// Shutting down the server right after it's started can cause draining to be skipped.
+		workloadRamp: 30 * time.Second,
+	})
+	return tests
+}
+
 func defaultFailureSmokeTestWorkload(ctx context.Context, c cluster.Cluster, args ...string) error {
 	workloadArgs := strings.Join(args, " ")
 	cmd := roachtestutil.NewCommand("./cockroach workload run kv %s", workloadArgs).
@@ -644,6 +774,7 @@ func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, no
 		dmsetupDiskStallTest(c),
 	}
 	failureSmokeTests = append(failureSmokeTests, cgroupsDiskStallTests(c)...)
+	failureSmokeTests = append(failureSmokeTests, processKillTests(c)...)
 
 	// Randomize the order of the tests in case any of the failures have unexpected side
 	// effects that may mask failures, e.g. a cgroups disk stall isn't properly recovered
@@ -652,6 +783,22 @@ func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, no
 		failureSmokeTests[i], failureSmokeTests[j] = failureSmokeTests[j], failureSmokeTests[i]
 	})
 
+	// For testing new failure modes, it may be useful to run only a subset of
+	// tests to increase iteration speed.
+	if regex := os.Getenv("FAILURE_INJECTION_SMOKE_TEST_FILTER"); regex != "" {
+		filter, err := regexp.Compile(regex)
+		if err != nil {
+			t.Fatal(err)
+		}
+		var filteredTests []failureSmokeTest
+		for _, test := range failureSmokeTests {
+			if filter.MatchString(test.testName) {
+				filteredTests = append(filteredTests, test)
+			}
+		}
+		failureSmokeTests = filteredTests
+	}
+
 	for _, test := range failureSmokeTests {
 		t.L().Printf("\n=====running %s test=====", test.testName)
 		if noopFailer {