Skip to content

Commit 5d32089

Browse files
craig[bot]DarrylWong
andcommitted
Merge #142054
142054: roachprod/failureinjection: add artificial latency failure r=golgeek a=DarrylWong This change adds a new artificial latency failure mode. This failure mode uses TC to create traffic filters that can be used to add latency between nodes. Informs: #138970 Release note: none Co-authored-by: DarrylWong <[email protected]>
2 parents be833f2 + 70fdb98 commit 5d32089

File tree

11 files changed

+492
-61
lines changed

11 files changed

+492
-61
lines changed

pkg/cmd/roachtest/cluster.go

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2507,7 +2507,7 @@ func (c *clusterImpl) RunE(ctx context.Context, options install.RunOptions, args
25072507
return errors.New("No command passed")
25082508
}
25092509
nodes := option.FromInstallNodes(options.Nodes)
2510-
l, logFile, err := c.loggerForCmd(nodes, args...)
2510+
l, logFile, err := roachtestutil.LoggerForCmd(c.l, nodes, args...)
25112511
if err != nil {
25122512
return err
25132513
}
@@ -2575,7 +2575,7 @@ func (c *clusterImpl) RunWithDetails(
25752575
return nil, errors.New("No command passed")
25762576
}
25772577
nodes := option.FromInstallNodes(options.Nodes)
2578-
l, logFile, err := c.loggerForCmd(nodes, args...)
2578+
l, logFile, err := roachtestutil.LoggerForCmd(c.l, nodes, args...)
25792579
if err != nil {
25802580
return nil, err
25812581
}
@@ -2659,30 +2659,6 @@ func (c *clusterImpl) Install(
26592659
return errors.Wrap(roachprod.Install(ctx, l, c.MakeNodes(nodes), software), "cluster.Install")
26602660
}
26612661

2662-
// cmdLogFileName comes up with a log file to use for the given argument string.
2663-
func cmdLogFileName(t time.Time, nodes option.NodeListOption, args ...string) string {
2664-
logFile := fmt.Sprintf(
2665-
"run_%s_n%s_%s",
2666-
t.Format(`150405.000000000`),
2667-
nodes.String()[1:],
2668-
install.GenFilenameFromArgs(20, args...),
2669-
)
2670-
return logFile
2671-
}
2672-
2673-
func (c *clusterImpl) loggerForCmd(
2674-
node option.NodeListOption, args ...string,
2675-
) (*logger.Logger, string, error) {
2676-
logFile := cmdLogFileName(timeutil.Now(), node, args...)
2677-
2678-
// NB: we set no prefix because it's only going to a file anyway.
2679-
l, err := c.l.ChildLogger(logFile, logger.QuietStderr, logger.QuietStdout)
2680-
if err != nil {
2681-
return nil, "", err
2682-
}
2683-
return l, logFile, nil
2684-
}
2685-
26862662
// pgURLErr returns the Postgres endpoint for the specified nodes. It accepts a
26872663
// flag specifying whether the URL should include the node's internal or
26882664
// external IP address. In general, inter-cluster communication and should use

pkg/cmd/roachtest/cluster_test.go

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111
"strconv"
1212
"strings"
1313
"testing"
14-
"time"
1514

1615
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
1716
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/task"
@@ -24,7 +23,6 @@ import (
2423
"github.com/cockroachdb/cockroach/pkg/util/version"
2524
"github.com/cockroachdb/datadriven"
2625
"github.com/cockroachdb/errors"
27-
"github.com/stretchr/testify/assert"
2826
"github.com/stretchr/testify/require"
2927
)
3028

@@ -686,21 +684,6 @@ func TestMachineTypes(t *testing.T) {
686684
})
687685
}
688686

689-
func TestCmdLogFileName(t *testing.T) {
690-
ts := time.Date(2000, 1, 1, 15, 4, 12, 0, time.Local)
691-
692-
const exp = `run_150412.000000000_n1,3-4,9_cockroach-bla-foo-ba`
693-
nodes := option.NodeListOption{1, 3, 4, 9}
694-
assert.Equal(t,
695-
exp,
696-
cmdLogFileName(ts, nodes, "./cockroach", "bla", "--foo", "bar"),
697-
)
698-
assert.Equal(t,
699-
exp,
700-
cmdLogFileName(ts, nodes, "./cockroach bla --foo bar"),
701-
)
702-
}
703-
704687
func TestVerifyLibraries(t *testing.T) {
705688
originalLibraryPaths := libraryFilePaths
706689
defer func() { libraryFilePaths = originalLibraryPaths }()

pkg/cmd/roachtest/roachtestutil/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,12 @@ go_test(
5353
srcs = [
5454
"commandbuilder_test.go",
5555
"load_group_test.go",
56+
"utils_test.go",
5657
],
5758
embed = [":roachtestutil"],
5859
deps = [
5960
"//pkg/cmd/roachtest/option",
61+
"@com_github_stretchr_testify//assert",
6062
"@com_github_stretchr_testify//require",
6163
],
6264
)

pkg/cmd/roachtest/roachtestutil/utils.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"net/http"
1414
"os"
1515
"regexp"
16+
"strconv"
1617
"strings"
1718
"time"
1819

@@ -218,6 +219,31 @@ func IfLocal(c cluster.Cluster, trueVal, falseVal string) string {
218219
return falseVal
219220
}
220221

222+
// LoggerForCmd creates a logger to a file with quiet stdout and stderr.
223+
func LoggerForCmd(
224+
l *logger.Logger, node option.NodeListOption, args ...string,
225+
) (*logger.Logger, string, error) {
226+
logFile := cmdLogFileName(timeutil.Now(), node, args...)
227+
228+
// NB: we set no prefix because it's only going to a file anyway.
229+
l, err := l.ChildLogger(logFile, logger.QuietStderr, logger.QuietStdout)
230+
if err != nil {
231+
return nil, "", err
232+
}
233+
return l, logFile, nil
234+
}
235+
236+
// cmdLogFileName comes up with a log file to use for the given argument string.
237+
func cmdLogFileName(t time.Time, nodes option.NodeListOption, args ...string) string {
238+
logFile := fmt.Sprintf(
239+
"run_%s_n%s_%s",
240+
t.Format(`150405.000000000`),
241+
nodes.String()[1:],
242+
install.GenFilenameFromArgs(20, args...),
243+
)
244+
return logFile
245+
}
246+
221247
// CheckPortBlocked returns true if a connection from a node to a port on another node
222248
// can be established. Requires nmap to be installed.
223249
func CheckPortBlocked(
@@ -237,3 +263,19 @@ func CheckPortBlocked(
237263
}
238264
return strings.Contains(res.Stdout, "filtered"), nil
239265
}
266+
267+
// PortLatency returns the latency from one node to another port.
268+
// Requires nmap to be installed.
269+
func PortLatency(
270+
ctx context.Context, l *logger.Logger, c cluster.Cluster, fromNode, toNode option.NodeListOption,
271+
) (time.Duration, error) {
272+
res, err := c.RunWithDetailsSingleNode(ctx, l, option.WithNodes(fromNode), fmt.Sprintf("nmap -p {pgport%[1]s} -Pn {ip%[1]s} -oG - | grep 'scanned in' | awk '{print $(NF-1)}'", toNode))
273+
if err != nil {
274+
return 0, err
275+
}
276+
avgRTT, err := strconv.ParseFloat(strings.TrimSpace(res.Stdout), 64)
277+
if err != nil {
278+
return 0, err
279+
}
280+
return time.Duration(avgRTT * float64(time.Second)), nil
281+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package roachtestutil
7+
8+
import (
9+
"testing"
10+
"time"
11+
12+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
13+
"github.com/stretchr/testify/assert"
14+
)
15+
16+
func TestCmdLogFileName(t *testing.T) {
17+
ts := time.Date(2000, 1, 1, 15, 4, 12, 0, time.Local)
18+
19+
const exp = `run_150412.000000000_n1,3-4,9_cockroach-bla-foo-ba`
20+
nodes := option.NodeListOption{1, 3, 4, 9}
21+
assert.Equal(t,
22+
exp,
23+
cmdLogFileName(ts, nodes, "./cockroach", "bla", "--foo", "bar"),
24+
)
25+
assert.Equal(t,
26+
exp,
27+
cmdLogFileName(ts, nodes, "./cockroach bla --foo bar"),
28+
)
29+
}

pkg/cmd/roachtest/tests/failure_injection.go

Lines changed: 113 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"context"
1010
"fmt"
1111
"math/rand"
12+
"time"
1213

1314
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
1415
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
@@ -35,45 +36,79 @@ type failureSmokeTest struct {
3536

3637
func (t *failureSmokeTest) run(
3738
ctx context.Context, l *logger.Logger, c cluster.Cluster, fr *failures.FailureRegistry,
38-
) error {
39+
) (err error) {
3940
// TODO(darryl): In the future, roachtests should interact with the failure injection library
4041
// through helper functions in roachtestutil so they don't have to interface with roachprod
4142
// directly.
42-
failureMode, err := fr.GetFailureMode(c.MakeNodes(), t.failureName, l, c.IsSecure())
43+
failureMode, err := fr.GetFailureMode(c.MakeNodes(c.CRDBNodes()), t.failureName, l, c.IsSecure())
4344
if err != nil {
4445
return err
4546
}
46-
if err = failureMode.Setup(ctx, l, t.args); err != nil {
47+
// Make sure to cleanup the failure mode even if the test fails.
48+
defer func() {
49+
quietLogger, file, logErr := roachtestutil.LoggerForCmd(l, c.CRDBNodes(), t.testName, "cleanup")
50+
if logErr != nil {
51+
l.Printf("failed to create logger for cleanup: %v", logErr)
52+
quietLogger = l
53+
}
54+
l.Printf("%s: Running Cleanup(); details in %s.log", t.failureName, file)
55+
err = errors.CombineErrors(err, failureMode.Cleanup(ctx, quietLogger, t.args))
56+
}()
57+
58+
quietLogger, file, err := roachtestutil.LoggerForCmd(l, c.CRDBNodes(), t.testName, "setup")
59+
if err != nil {
60+
return err
61+
}
62+
l.Printf("%s: Running Setup(); details in %s.log", t.failureName, file)
63+
if err = failureMode.Setup(ctx, quietLogger, t.args); err != nil {
64+
return err
65+
}
66+
67+
quietLogger, file, err = roachtestutil.LoggerForCmd(l, c.CRDBNodes(), t.testName, "inject")
68+
if err != nil {
4769
return err
4870
}
49-
if err = failureMode.Inject(ctx, l, t.args); err != nil {
71+
l.Printf("%s: Running Inject(); details in %s.log", t.failureName, file)
72+
if err = failureMode.Inject(ctx, quietLogger, t.args); err != nil {
5073
return err
5174
}
5275

5376
// Allow the failure to take effect.
54-
if err = failureMode.WaitForFailureToPropagate(ctx, l, t.args); err != nil {
77+
quietLogger, file, err = roachtestutil.LoggerForCmd(l, c.CRDBNodes(), t.testName, "wait for propagate")
78+
if err != nil {
79+
return err
80+
}
81+
l.Printf("%s: Running WaitForFailureToPropagate(); details in %s.log", t.failureName, file)
82+
if err = failureMode.WaitForFailureToPropagate(ctx, quietLogger, t.args); err != nil {
5583
return err
5684
}
5785

86+
l.Printf("validating failure was properly injected")
5887
if err = t.validateFailure(ctx, l, c); err != nil {
5988
return err
6089
}
61-
if err = failureMode.Restore(ctx, l, t.args); err != nil {
90+
91+
quietLogger, file, err = roachtestutil.LoggerForCmd(l, c.CRDBNodes(), t.testName, "restore")
92+
if err != nil {
6293
return err
6394
}
64-
65-
// Allow the cluster to return to normal.
66-
if err = failureMode.WaitForFailureToRestore(ctx, l, t.args); err != nil {
95+
l.Printf("%s: Running Restore(); details in %s.log", t.failureName, file)
96+
if err = failureMode.Restore(ctx, quietLogger, t.args); err != nil {
6797
return err
6898
}
6999

70-
if err = t.validateRestore(ctx, l, c); err != nil {
100+
// Allow the cluster to return to normal.
101+
quietLogger, file, err = roachtestutil.LoggerForCmd(l, c.CRDBNodes(), t.testName, "wait for restore")
102+
if err != nil {
71103
return err
72104
}
73-
if err = failureMode.Cleanup(ctx, l, t.args); err != nil {
105+
l.Printf("%s: Running WaitForFailureToRestore(); details in %s.log", t.failureName, file)
106+
if err = failureMode.WaitForFailureToRestore(ctx, quietLogger, t.args); err != nil {
74107
return err
75108
}
76-
return nil
109+
110+
l.Printf("validating failure was properly restored")
111+
return t.validateRestore(ctx, l, c)
77112
}
78113

79114
func (t *failureSmokeTest) noopRun(
@@ -233,6 +268,69 @@ var asymmetricOutgoingNetworkPartitionTest = func(c cluster.Cluster) failureSmok
233268
}
234269
}
235270

271+
var latencyTest = func(c cluster.Cluster) failureSmokeTest {
272+
nodes := c.CRDBNodes()
273+
rand.Shuffle(len(nodes), func(i, j int) {
274+
nodes[i], nodes[j] = nodes[j], nodes[i]
275+
})
276+
srcNode := nodes[0]
277+
destNode := nodes[1]
278+
unaffectedNode := nodes[2]
279+
return failureSmokeTest{
280+
testName: "Network Latency",
281+
failureName: failures.NetworkLatencyName,
282+
args: failures.NetworkLatencyArgs{
283+
ArtificialLatencies: []failures.ArtificialLatency{
284+
{
285+
Source: install.Nodes{install.Node(srcNode)},
286+
Destination: install.Nodes{install.Node(destNode)},
287+
Delay: 2 * time.Second,
288+
},
289+
{
290+
Source: install.Nodes{install.Node(destNode)},
291+
Destination: install.Nodes{install.Node(srcNode)},
292+
Delay: 2 * time.Second,
293+
},
294+
},
295+
},
296+
validateFailure: func(ctx context.Context, l *logger.Logger, c cluster.Cluster) error {
297+
// Note that this is one way latency, since the sender doesn't have the matching port.
298+
delayedLatency, err := roachtestutil.PortLatency(ctx, l, c, c.Nodes(srcNode), c.Nodes(destNode))
299+
if err != nil {
300+
return err
301+
}
302+
normalLatency, err := roachtestutil.PortLatency(ctx, l, c, c.Nodes(unaffectedNode), c.Nodes(destNode))
303+
if err != nil {
304+
return err
305+
}
306+
if delayedLatency < normalLatency*2 {
307+
return errors.Errorf("expected latency between nodes with artificial latency (n%d and n%d) to be much higher than between nodes without (n%d and n%d)", srcNode, destNode, unaffectedNode, destNode)
308+
}
309+
if delayedLatency < time.Second || delayedLatency > 3*time.Second {
310+
return errors.Errorf("expected latency between nodes with artificial latency (n%d and n%d) to be at least within 1s and 3s", srcNode, destNode)
311+
}
312+
return nil
313+
},
314+
validateRestore: func(ctx context.Context, l *logger.Logger, c cluster.Cluster) error {
315+
delayedLatency, err := roachtestutil.PortLatency(ctx, l, c, c.Nodes(srcNode), c.Nodes(destNode))
316+
if err != nil {
317+
return err
318+
}
319+
normalLatency, err := roachtestutil.PortLatency(ctx, l, c, c.Nodes(unaffectedNode), c.Nodes(destNode))
320+
if err != nil {
321+
return err
322+
}
323+
if delayedLatency > 2*normalLatency {
324+
return errors.Errorf("expected latency between nodes with artificial latency (n%d and n%d) to be close to latency between nodes without (n%d and n%d)", srcNode, destNode, unaffectedNode, destNode)
325+
}
326+
if delayedLatency > 500*time.Millisecond {
327+
return errors.Errorf("expected latency between nodes with artificial latency (n%d and n%d) to have restored to at least less than 500ms", srcNode, destNode)
328+
}
329+
return nil
330+
},
331+
}
332+
}
333+
236334
func setupFailureSmokeTests(ctx context.Context, t test.Test, c cluster.Cluster) error {
237335
// Download any dependencies needed.
238336
if err := c.Install(ctx, t.L(), c.CRDBNodes(), "nmap"); err != nil {
@@ -258,6 +356,7 @@ func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, no
258356
bidirectionalNetworkPartitionTest(c),
259357
asymmetricIncomingNetworkPartitionTest(c),
260358
asymmetricOutgoingNetworkPartitionTest(c),
359+
latencyTest(c),
261360
}
262361

263362
// Randomize the order of the tests in case any of the failures have unexpected side
@@ -284,7 +383,7 @@ func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, no
284383

285384
func registerFISmokeTest(r registry.Registry) {
286385
r.Add(registry.TestSpec{
287-
Name: "failure-injection-smoke-test",
386+
Name: "failure-injection/smoke-test",
288387
Owner: registry.OwnerTestEng,
289388
Cluster: r.MakeClusterSpec(4, spec.WorkloadNode(), spec.CPU(2), spec.WorkloadNodeCPU(2), spec.ReuseNone()),
290389
CompatibleClouds: registry.OnlyGCE,
@@ -295,7 +394,7 @@ func registerFISmokeTest(r registry.Registry) {
295394
},
296395
})
297396
r.Add(registry.TestSpec{
298-
Name: "failure-injection-noop-smoke-test",
397+
Name: "failure-injection/smoke-test/noop",
299398
Owner: registry.OwnerTestEng,
300399
Cluster: r.MakeClusterSpec(4, spec.WorkloadNode(), spec.CPU(2), spec.WorkloadNodeCPU(2), spec.ReuseNone()),
301400
CompatibleClouds: registry.OnlyGCE,

0 commit comments

Comments
 (0)