|
| 1 | +// Copyright 2023 The Cockroach Authors. |
| 2 | +// |
| 3 | +// Use of this software is governed by the Business Source License |
| 4 | +// included in the file licenses/BSL.txt. |
| 5 | +// |
| 6 | +// As of the Change Date specified in that file, in accordance with |
| 7 | +// the Business Source License, use of this software will be governed |
| 8 | +// by the Apache License, Version 2.0, included in the file |
| 9 | +// licenses/APL.txt. |
| 10 | + |
| 11 | +package tests |
| 12 | + |
| 13 | +import ( |
| 14 | + "context" |
| 15 | + "fmt" |
| 16 | + "strings" |
| 17 | + |
| 18 | + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" |
| 19 | + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" |
| 20 | + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" |
| 21 | + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" |
| 22 | + "github.com/cockroachdb/cockroach/pkg/roachprod" |
| 23 | + "github.com/cockroachdb/cockroach/pkg/roachprod/install" |
| 24 | + "github.com/stretchr/testify/require" |
| 25 | +) |
| 26 | + |
const (
	// Total cluster size: three CockroachDB nodes plus one node that
	// drives the workload.
	numNodesNetworkLogging = 4
	// TCP port interpolated into the FluentBit container port mapping and
	// into both sink addresses of the log config.
	fluentBitTCPPort = 5170

	// logConfigTemplate is a flow-style YAML log config declaring one
	// fluent-server sink and one http-server sink, each with buffering
	// enabled through the corresponding *-defaults section. It carries two
	// %d verbs: the first is the fluent sink's port and the second is the
	// HTTP sink's port. The pieces below concatenate into a single line.
	logConfigTemplate = "{ http-defaults: { format: json-fluent, buffering: { max-staleness: 5s, flush-trigger-size: 1.0MiB, max-buffer-size: 50MiB } }, " +
		"fluent-defaults: { format: json-fluent, buffering: { max-staleness: 5s, flush-trigger-size: 1.0MiB, max-buffer-size: 50MiB } }, " +
		"sinks: { " +
		"fluent-servers: { test-output: { channels: {INFO: all}, net: tcp, address: localhost:%d, filter: INFO, redact: false } }, " +
		"http-servers: { test-output: { channels: {INFO: all}, address: http://localhost:%d, filter: INFO, method: POST, unsafe-tls: true } } " +
		"} }"
)
| 35 | + |
| 36 | +func registerNetworkLogging(r registry.Registry) { |
| 37 | + runNetworkLogging := func( |
| 38 | + ctx context.Context, |
| 39 | + t test.Test, |
| 40 | + c cluster.Cluster, |
| 41 | + ) { |
| 42 | + crdbNodes := c.Range(1, c.Spec().NodeCount-1) |
| 43 | + workloadNode := c.Node(c.Spec().NodeCount) |
| 44 | + |
| 45 | + // Install Docker, which we'll use for FluentBit. |
| 46 | + t.Status("installing docker") |
| 47 | + if err := c.Install(ctx, t.L(), crdbNodes, "docker"); err != nil { |
| 48 | + t.Fatalf("failed to install docker: %v", err) |
| 49 | + } |
| 50 | + |
| 51 | + t.Status("installing FluentBit containers on CRDB nodes") |
| 52 | + // Create FluentBit container on the node with a TCP input and dev/null output. |
| 53 | + err := c.RunE(ctx, crdbNodes, fmt.Sprintf( |
| 54 | + "sudo docker run -d -p %d:%d --name=fluentbit fluent/fluent-bit -i tcp -o null", |
| 55 | + fluentBitTCPPort, |
| 56 | + fluentBitTCPPort)) |
| 57 | + if err != nil { |
| 58 | + t.Fatalf("failed to install FluentBit containers: %v", err) |
| 59 | + } |
| 60 | + |
| 61 | + // Install Cockroach, including on the workload node, |
| 62 | + // since we'll use ./cockroach workload. |
| 63 | + t.Status("installing cockroach") |
| 64 | + c.Put(ctx, t.Cockroach(), "./cockroach", c.All()) |
| 65 | + |
| 66 | + // Start each node with a log config containing fluent-server and http-server sinks. |
| 67 | + t.Status("starting cockroach on nodes") |
| 68 | + startOpts := option.DefaultStartOpts() |
| 69 | + logCfg := fmt.Sprintf(logConfigTemplate, fluentBitTCPPort, fluentBitTCPPort) |
| 70 | + startOpts.RoachprodOpts.ExtraArgs = []string{ |
| 71 | + "--log", logCfg, |
| 72 | + } |
| 73 | + c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(install.SecureOption(true)), crdbNodes) |
| 74 | + |
| 75 | + // Construct pgurls for the workload runner. As a roundabout way of detecting deadlocks, |
| 76 | + // we set a client timeout on the workload pgclient. If the server becomes unavailable |
| 77 | + // due to a deadlock, the timeout will eventually trigger and cause the test to fail. |
| 78 | + // We've had network logging bugs in the past that deadlocked without the nodes dying, |
| 79 | + // so this helps detect such a case. |
| 80 | + secureUrls, err := roachprod.PgURL(ctx, |
| 81 | + t.L(), |
| 82 | + c.MakeNodes(crdbNodes), |
| 83 | + "certs", /* certsDir */ |
| 84 | + roachprod.PGURLOptions{ |
| 85 | + External: false, |
| 86 | + Secure: true}) |
| 87 | + require.NoError(t, err) |
| 88 | + workloadPGURLs := make([]string, len(secureUrls)) |
| 89 | + for i, url := range secureUrls { |
| 90 | + // URLs already are wrapped in '', but we need to add a timeout flag. |
| 91 | + // Trim the trailing ' and re-add with the flag. |
| 92 | + trimmed := strings.TrimSuffix(url, "'") |
| 93 | + workloadPGURLs[i] = fmt.Sprintf("%s&statement_timeout=10s'", trimmed) |
| 94 | + } |
| 95 | + |
| 96 | + // Init & run a workload on the workload node. |
| 97 | + t.Status("initializing workload") |
| 98 | + initWorkloadCmd := fmt.Sprintf("./cockroach workload init kv %s ", secureUrls[0]) |
| 99 | + c.Run(ctx, workloadNode, initWorkloadCmd) |
| 100 | + |
| 101 | + t.Status("running workload") |
| 102 | + m := c.NewMonitor(ctx, crdbNodes) |
| 103 | + m.Go(func(ctx context.Context) error { |
| 104 | + joinedURLs := strings.Join(workloadPGURLs, " ") |
| 105 | + runWorkloadCmd := fmt.Sprintf("./cockroach workload run kv --concurrency=32 --duration=1h %s", joinedURLs) |
| 106 | + return c.RunE(ctx, workloadNode, runWorkloadCmd) |
| 107 | + }) |
| 108 | + m.Wait() |
| 109 | + } |
| 110 | + |
| 111 | + r.Add(registry.TestSpec{ |
| 112 | + Name: "network_logging", |
| 113 | + Owner: registry.OwnerObsInf, |
| 114 | + Cluster: r.MakeClusterSpec(numNodesNetworkLogging), |
| 115 | + Leases: registry.MetamorphicLeases, |
| 116 | + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { |
| 117 | + runNetworkLogging(ctx, t, c) |
| 118 | + }, |
| 119 | + }) |
| 120 | +} |
0 commit comments