99 "context"
1010 "fmt"
1111 "math/rand"
12+ "time"
1213
1314 "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
1415 "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
@@ -35,45 +36,79 @@ type failureSmokeTest struct {
3536
3637func (t * failureSmokeTest ) run (
3738 ctx context.Context , l * logger.Logger , c cluster.Cluster , fr * failures.FailureRegistry ,
38- ) error {
39+ ) ( err error ) {
3940 // TODO(darryl): In the future, roachtests should interact with the failure injection library
4041 // through helper functions in roachtestutil so they don't have to interface with roachprod
4142 // directly.
42- failureMode , err := fr .GetFailureMode (c .MakeNodes (), t .failureName , l , c .IsSecure ())
43+ failureMode , err := fr .GetFailureMode (c .MakeNodes (c . CRDBNodes () ), t .failureName , l , c .IsSecure ())
4344 if err != nil {
4445 return err
4546 }
46- if err = failureMode .Setup (ctx , l , t .args ); err != nil {
47+ // Make sure to cleanup the failure mode even if the test fails.
48+ defer func () {
49+ quietLogger , file , logErr := roachtestutil .LoggerForCmd (l , c .CRDBNodes (), t .testName , "cleanup" )
50+ if logErr != nil {
51+ l .Printf ("failed to create logger for cleanup: %v" , logErr )
52+ quietLogger = l
53+ }
54+ l .Printf ("%s: Running Cleanup(); details in %s.log" , t .failureName , file )
55+ err = errors .CombineErrors (err , failureMode .Cleanup (ctx , quietLogger , t .args ))
56+ }()
57+
58+ quietLogger , file , err := roachtestutil .LoggerForCmd (l , c .CRDBNodes (), t .testName , "setup" )
59+ if err != nil {
60+ return err
61+ }
62+ l .Printf ("%s: Running Setup(); details in %s.log" , t .failureName , file )
63+ if err = failureMode .Setup (ctx , quietLogger , t .args ); err != nil {
64+ return err
65+ }
66+
67+ quietLogger , file , err = roachtestutil .LoggerForCmd (l , c .CRDBNodes (), t .testName , "inject" )
68+ if err != nil {
4769 return err
4870 }
49- if err = failureMode .Inject (ctx , l , t .args ); err != nil {
71+ l .Printf ("%s: Running Inject(); details in %s.log" , t .failureName , file )
72+ if err = failureMode .Inject (ctx , quietLogger , t .args ); err != nil {
5073 return err
5174 }
5275
5376 // Allow the failure to take effect.
54- if err = failureMode .WaitForFailureToPropagate (ctx , l , t .args ); err != nil {
77+ quietLogger , file , err = roachtestutil .LoggerForCmd (l , c .CRDBNodes (), t .testName , "wait for propagate" )
78+ if err != nil {
79+ return err
80+ }
81+ l .Printf ("%s: Running WaitForFailureToPropagate(); details in %s.log" , t .failureName , file )
82+ if err = failureMode .WaitForFailureToPropagate (ctx , quietLogger , t .args ); err != nil {
5583 return err
5684 }
5785
86+ l .Printf ("validating failure was properly injected" )
5887 if err = t .validateFailure (ctx , l , c ); err != nil {
5988 return err
6089 }
61- if err = failureMode .Restore (ctx , l , t .args ); err != nil {
90+
91+ quietLogger , file , err = roachtestutil .LoggerForCmd (l , c .CRDBNodes (), t .testName , "restore" )
92+ if err != nil {
6293 return err
6394 }
64-
65- // Allow the cluster to return to normal.
66- if err = failureMode .WaitForFailureToRestore (ctx , l , t .args ); err != nil {
95+ l .Printf ("%s: Running Restore(); details in %s.log" , t .failureName , file )
96+ if err = failureMode .Restore (ctx , quietLogger , t .args ); err != nil {
6797 return err
6898 }
6999
70- if err = t .validateRestore (ctx , l , c ); err != nil {
100+ // Allow the cluster to return to normal.
101+ quietLogger , file , err = roachtestutil .LoggerForCmd (l , c .CRDBNodes (), t .testName , "wait for restore" )
102+ if err != nil {
71103 return err
72104 }
73- if err = failureMode .Cleanup (ctx , l , t .args ); err != nil {
105+ l .Printf ("%s: Running WaitForFailureToRestore(); details in %s.log" , t .failureName , file )
106+ if err = failureMode .WaitForFailureToRestore (ctx , quietLogger , t .args ); err != nil {
74107 return err
75108 }
76- return nil
109+
110+ l .Printf ("validating failure was properly restored" )
111+ return t .validateRestore (ctx , l , c )
77112}
78113
79114func (t * failureSmokeTest ) noopRun (
@@ -233,6 +268,69 @@ var asymmetricOutgoingNetworkPartitionTest = func(c cluster.Cluster) failureSmok
233268 }
234269}
235270
271+ var latencyTest = func (c cluster.Cluster ) failureSmokeTest {
272+ nodes := c .CRDBNodes ()
273+ rand .Shuffle (len (nodes ), func (i , j int ) {
274+ nodes [i ], nodes [j ] = nodes [j ], nodes [i ]
275+ })
276+ srcNode := nodes [0 ]
277+ destNode := nodes [1 ]
278+ unaffectedNode := nodes [2 ]
279+ return failureSmokeTest {
280+ testName : "Network Latency" ,
281+ failureName : failures .NetworkLatencyName ,
282+ args : failures.NetworkLatencyArgs {
283+ ArtificialLatencies : []failures.ArtificialLatency {
284+ {
285+ Source : install.Nodes {install .Node (srcNode )},
286+ Destination : install.Nodes {install .Node (destNode )},
287+ Delay : 2 * time .Second ,
288+ },
289+ {
290+ Source : install.Nodes {install .Node (destNode )},
291+ Destination : install.Nodes {install .Node (srcNode )},
292+ Delay : 2 * time .Second ,
293+ },
294+ },
295+ },
296+ validateFailure : func (ctx context.Context , l * logger.Logger , c cluster.Cluster ) error {
297+ // Note that this is one way latency, since the sender doesn't have the matching port.
298+ delayedLatency , err := roachtestutil .PortLatency (ctx , l , c , c .Nodes (srcNode ), c .Nodes (destNode ))
299+ if err != nil {
300+ return err
301+ }
302+ normalLatency , err := roachtestutil .PortLatency (ctx , l , c , c .Nodes (unaffectedNode ), c .Nodes (destNode ))
303+ if err != nil {
304+ return err
305+ }
306+ if delayedLatency < normalLatency * 2 {
307+ return errors .Errorf ("expected latency between nodes with artificial latency (n%d and n%d) to be much higher than between nodes without (n%d and n%d)" , srcNode , destNode , unaffectedNode , destNode )
308+ }
309+ if delayedLatency < time .Second || delayedLatency > 3 * time .Second {
310+ return errors .Errorf ("expected latency between nodes with artificial latency (n%d and n%d) to be at least within 1s and 3s" , srcNode , destNode )
311+ }
312+ return nil
313+ },
314+ validateRestore : func (ctx context.Context , l * logger.Logger , c cluster.Cluster ) error {
315+ delayedLatency , err := roachtestutil .PortLatency (ctx , l , c , c .Nodes (srcNode ), c .Nodes (destNode ))
316+ if err != nil {
317+ return err
318+ }
319+ normalLatency , err := roachtestutil .PortLatency (ctx , l , c , c .Nodes (unaffectedNode ), c .Nodes (destNode ))
320+ if err != nil {
321+ return err
322+ }
323+ if delayedLatency > 2 * normalLatency {
324+ return errors .Errorf ("expected latency between nodes with artificial latency (n%d and n%d) to be close to latency between nodes without (n%d and n%d)" , srcNode , destNode , unaffectedNode , destNode )
325+ }
326+ if delayedLatency > 500 * time .Millisecond {
327+ return errors .Errorf ("expected latency between nodes with artificial latency (n%d and n%d) to have restored to at least less than 500ms" , srcNode , destNode )
328+ }
329+ return nil
330+ },
331+ }
332+ }
333+
236334func setupFailureSmokeTests (ctx context.Context , t test.Test , c cluster.Cluster ) error {
237335 // Download any dependencies needed.
238336 if err := c .Install (ctx , t .L (), c .CRDBNodes (), "nmap" ); err != nil {
@@ -258,6 +356,7 @@ func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, no
258356 bidirectionalNetworkPartitionTest (c ),
259357 asymmetricIncomingNetworkPartitionTest (c ),
260358 asymmetricOutgoingNetworkPartitionTest (c ),
359+ latencyTest (c ),
261360 }
262361
263362 // Randomize the order of the tests in case any of the failures have unexpected side
@@ -284,7 +383,7 @@ func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, no
284383
285384func registerFISmokeTest (r registry.Registry ) {
286385 r .Add (registry.TestSpec {
287- Name : "failure-injection- smoke-test" ,
386+ Name : "failure-injection/ smoke-test" ,
288387 Owner : registry .OwnerTestEng ,
289388 Cluster : r .MakeClusterSpec (4 , spec .WorkloadNode (), spec .CPU (2 ), spec .WorkloadNodeCPU (2 ), spec .ReuseNone ()),
290389 CompatibleClouds : registry .OnlyGCE ,
@@ -295,7 +394,7 @@ func registerFISmokeTest(r registry.Registry) {
295394 },
296395 })
297396 r .Add (registry.TestSpec {
298- Name : "failure-injection-noop- smoke-test" ,
397+ Name : "failure-injection/ smoke-test/noop " ,
299398 Owner : registry .OwnerTestEng ,
300399 Cluster : r .MakeClusterSpec (4 , spec .WorkloadNode (), spec .CPU (2 ), spec .WorkloadNodeCPU (2 ), spec .ReuseNone ()),
301400 CompatibleClouds : registry .OnlyGCE ,
0 commit comments