Skip to content

Commit 8656853

Browse files
committed
roachprod/failure-injection: add artificial latency failure
This change adds a new artificial latency failure mode. This failure mode uses the Linux traffic control utility (tc) to create traffic filters that add latency between nodes.
1 parent e452d47 commit 8656853

File tree

7 files changed

+395
-15
lines changed

7 files changed

+395
-15
lines changed

pkg/cmd/roachtest/roachtestutil/utils.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"net/http"
1414
"os"
1515
"regexp"
16+
"strconv"
1617
"strings"
1718
"time"
1819

@@ -237,3 +238,19 @@ func CheckPortBlocked(
237238
}
238239
return strings.Contains(res.Stdout, "filtered"), nil
239240
}
241+
242+
// PortLatency returns the latency from one node to another port.
243+
// Requires nmap to be installed.
244+
func PortLatency(
245+
ctx context.Context, l *logger.Logger, c cluster.Cluster, fromNode, toNode option.NodeListOption,
246+
) (time.Duration, error) {
247+
res, err := c.RunWithDetailsSingleNode(ctx, l, option.WithNodes(fromNode), fmt.Sprintf("nmap -p {pgport%[1]s} -Pn {ip%[1]s} -oG - | grep 'scanned in' | awk '{print $(NF-1)}'", toNode))
248+
if err != nil {
249+
return 0, err
250+
}
251+
avgRTT, err := strconv.ParseFloat(strings.TrimSpace(res.Stdout), 64)
252+
if err != nil {
253+
return 0, err
254+
}
255+
return time.Duration(avgRTT * float64(time.Second)), nil
256+
}

pkg/cmd/roachtest/tests/failure_injection.go

Lines changed: 74 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"context"
1010
"fmt"
1111
"math/rand"
12+
"time"
1213

1314
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
1415
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
@@ -35,14 +36,18 @@ type failureSmokeTest struct {
3536

3637
func (t *failureSmokeTest) run(
3738
ctx context.Context, l *logger.Logger, c cluster.Cluster, fr *failures.FailureRegistry,
38-
) error {
39+
) (err error) {
3940
// TODO(darryl): In the future, roachtests should interact with the failure injection library
4041
// through helper functions in roachtestutil so they don't have to interface with roachprod
4142
// directly.
42-
failureMode, err := fr.GetFailureMode(c.MakeNodes(), t.failureName, l, c.IsSecure())
43+
failureMode, err := fr.GetFailureMode(c.MakeNodes(c.CRDBNodes()), t.failureName, l, c.IsSecure())
4344
if err != nil {
4445
return err
4546
}
47+
// Make sure to cleanup the failure mode even if the test fails.
48+
defer func() {
49+
err = errors.CombineErrors(err, failureMode.Cleanup(ctx, l, t.args))
50+
}()
4651
if err = failureMode.Setup(ctx, l, t.args); err != nil {
4752
return err
4853
}
@@ -67,13 +72,7 @@ func (t *failureSmokeTest) run(
6772
return err
6873
}
6974

70-
if err = t.validateRestore(ctx, l, c); err != nil {
71-
return err
72-
}
73-
if err = failureMode.Cleanup(ctx, l, t.args); err != nil {
74-
return err
75-
}
76-
return nil
75+
return t.validateRestore(ctx, l, c)
7776
}
7877

7978
func (t *failureSmokeTest) noopRun(
@@ -233,6 +232,69 @@ var asymmetricOutgoingNetworkPartitionTest = func(c cluster.Cluster) failureSmok
233232
}
234233
}
235234

235+
var latencyTest = func(c cluster.Cluster) failureSmokeTest {
236+
nodes := c.CRDBNodes()
237+
rand.Shuffle(len(nodes), func(i, j int) {
238+
nodes[i], nodes[j] = nodes[j], nodes[i]
239+
})
240+
srcNode := nodes[0]
241+
destNode := nodes[1]
242+
unaffectedNode := nodes[2]
243+
return failureSmokeTest{
244+
testName: "Network Latency",
245+
failureName: failures.NetworkLatencyName,
246+
args: failures.NetworkLatencyArgs{
247+
ArtificialLatencies: []failures.ArtificialLatency{
248+
{
249+
Source: install.Nodes{install.Node(srcNode)},
250+
Destination: install.Nodes{install.Node(destNode)},
251+
Delay: 2 * time.Second,
252+
},
253+
{
254+
Source: install.Nodes{install.Node(destNode)},
255+
Destination: install.Nodes{install.Node(srcNode)},
256+
Delay: 2 * time.Second,
257+
},
258+
},
259+
},
260+
validateFailure: func(ctx context.Context, l *logger.Logger, c cluster.Cluster) error {
261+
// Note that this is one way latency, since the sender doesn't have the matching port.
262+
delayedLatency, err := roachtestutil.PortLatency(ctx, l, c, c.Nodes(srcNode), c.Nodes(destNode))
263+
if err != nil {
264+
return err
265+
}
266+
normalLatency, err := roachtestutil.PortLatency(ctx, l, c, c.Nodes(unaffectedNode), c.Nodes(destNode))
267+
if err != nil {
268+
return err
269+
}
270+
if delayedLatency < normalLatency*2 {
271+
return errors.Errorf("expected latency between nodes with artificial latency (n%d and n%d) to be much higher than between nodes without (n%d and n%d)", srcNode, destNode, unaffectedNode, destNode)
272+
}
273+
if delayedLatency < time.Second || delayedLatency > 3*time.Second {
274+
return errors.Errorf("expected latency between nodes with artificial latency (n%d and n%d) to be at least within 1s and 3s", srcNode, destNode)
275+
}
276+
return nil
277+
},
278+
validateRestore: func(ctx context.Context, l *logger.Logger, c cluster.Cluster) error {
279+
delayedLatency, err := roachtestutil.PortLatency(ctx, l, c, c.Nodes(srcNode), c.Nodes(destNode))
280+
if err != nil {
281+
return err
282+
}
283+
normalLatency, err := roachtestutil.PortLatency(ctx, l, c, c.Nodes(unaffectedNode), c.Nodes(destNode))
284+
if err != nil {
285+
return err
286+
}
287+
if delayedLatency > 2*normalLatency {
288+
return errors.Errorf("expected latency between nodes with artificial latency (n%d and n%d) to be close to latency between nodes without (n%d and n%d)", srcNode, destNode, unaffectedNode, destNode)
289+
}
290+
if delayedLatency > 500*time.Millisecond {
291+
return errors.Errorf("expected latency between nodes with artificial latency (n%d and n%d) to have restored to at least less than 500ms", srcNode, destNode)
292+
}
293+
return nil
294+
},
295+
}
296+
}
297+
236298
func setupFailureSmokeTests(ctx context.Context, t test.Test, c cluster.Cluster) error {
237299
// Download any dependencies needed.
238300
if err := c.Install(ctx, t.L(), c.CRDBNodes(), "nmap"); err != nil {
@@ -258,6 +320,7 @@ func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, no
258320
bidirectionalNetworkPartitionTest(c),
259321
asymmetricIncomingNetworkPartitionTest(c),
260322
asymmetricOutgoingNetworkPartitionTest(c),
323+
latencyTest(c),
261324
}
262325

263326
// Randomize the order of the tests in case any of the failures have unexpected side
@@ -284,7 +347,7 @@ func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, no
284347

285348
func registerFISmokeTest(r registry.Registry) {
286349
r.Add(registry.TestSpec{
287-
Name: "failure-injection-smoke-test",
350+
Name: "failure-injection/smoke-test",
288351
Owner: registry.OwnerTestEng,
289352
Cluster: r.MakeClusterSpec(4, spec.WorkloadNode(), spec.CPU(2), spec.WorkloadNodeCPU(2), spec.ReuseNone()),
290353
CompatibleClouds: registry.OnlyGCE,
@@ -295,7 +358,7 @@ func registerFISmokeTest(r registry.Registry) {
295358
},
296359
})
297360
r.Add(registry.TestSpec{
298-
Name: "failure-injection-noop-smoke-test",
361+
Name: "failure-injection/smoke-test/noop",
299362
Owner: registry.OwnerTestEng,
300363
Cluster: r.MakeClusterSpec(4, spec.WorkloadNode(), spec.CPU(2), spec.WorkloadNodeCPU(2), spec.ReuseNone()),
301364
CompatibleClouds: registry.OnlyGCE,

pkg/roachprod/failureinjection/failures/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ go_library(
44
name = "failures",
55
srcs = [
66
"failure.go",
7+
"latency.go",
78
"network_partition.go",
89
"registry.go",
910
],
@@ -13,5 +14,6 @@ go_library(
1314
"//pkg/roachprod",
1415
"//pkg/roachprod/install",
1516
"//pkg/roachprod/logger",
17+
"@com_github_cockroachdb_errors//:errors",
1618
],
1719
)

pkg/roachprod/failureinjection/failures/failure.go

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111

1212
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
1313
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
14+
"github.com/cockroachdb/errors"
1415
)
1516

1617
// FailureArgs describes the args passed to a failure mode.
@@ -35,7 +36,8 @@ type FailureMode interface {
3536
// Inject a failure into the system.
3637
Inject(ctx context.Context, l *logger.Logger, args FailureArgs) error
3738

38-
// Restore reverses the effects of Inject.
39+
// Restore reverses the effects of Inject. The same args passed to Inject
40+
// must be passed to Restore.
3941
Restore(ctx context.Context, l *logger.Logger, args FailureArgs) error
4042

4143
// Cleanup uninstalls any dependencies that were installed by Setup.
@@ -52,9 +54,11 @@ type FailureMode interface {
5254
// provide commonly used functionality that doesn't differ between failure modes,
5355
// e.g. running remote commands on the cluster.
5456
type GenericFailure struct {
57+
// TODO(Darryl): support specifying virtual clusters
5558
c *install.SyncedCluster
5659
// runTitle is the title to prefix command output with.
57-
runTitle string
60+
runTitle string
61+
networkInterfaces []string
5862
}
5963

6064
func (f *GenericFailure) Run(
@@ -84,3 +88,22 @@ func (f *GenericFailure) RunWithDetails(
8488
}
8589
return res[0], nil
8690
}
91+
92+
// NetworkInterfaces returns the network interfaces used by the VMs in the cluster.
93+
// Assumes that all VMs are using the same machine type and will have the same
94+
// network interfaces.
95+
func (f *GenericFailure) NetworkInterfaces(
96+
ctx context.Context, l *logger.Logger,
97+
) ([]string, error) {
98+
if f.networkInterfaces == nil {
99+
res, err := f.c.RunWithDetails(ctx, l, install.WithNodes(f.c.Nodes[:1]), "Get Network Interfaces", "ip -o link show | awk -F ': ' '{print $2}'")
100+
if err != nil {
101+
return nil, errors.Wrapf(err, "error when determining network interfaces")
102+
}
103+
interfaces := strings.Split(strings.TrimSpace(res[0].Stdout), "\n")
104+
for _, iface := range interfaces {
105+
f.networkInterfaces = append(f.networkInterfaces, strings.TrimSpace(iface))
106+
}
107+
}
108+
return f.networkInterfaces, nil
109+
}

0 commit comments

Comments
 (0)