Skip to content

Commit 70a5801

Browse files
craig[bot]abarganierrailrafiss
committed
107730: roachtest: add network_logging test r=dhartunian,renatolabs a=abarganier Currently, network log features such as the `http-server` and `fluent-server` are not exercised as part of our roachtest framework. Because of this, the feature is largely untested in an e2e environment, leaving a large gap in our test coverage. This patch adds a new roachtest, `network_logging`, that exercises both of these features. By default, these types of log sinks are buffered via the bufferedSink code, so the tests also exercise that code path. The test provisions a cluster and runs a workload against it. Each CRDB node is accompanied by a Docker container running FluentBit. The FluentBit containers themselves don't do anything other than receive logs, and then send them to dev/null. Again, the goal of the test is to exercise the *output* of the logs from CRDB, not what's done with them externally on the receiving end, hence dev/null. The workload that is run is kv, for a duration of 1h. The type of workload is less important than the fact that logs are flowing through the fluent/http sinks. A statement_timeout option is included in the pgurl provided to the workload, which will be triggered if the nodes become unavailable due to something like a deadlock scenario. Release note: none Addresses: cockroachdb#105357 Epic: CC-9681 107940: release: revert "run preflight on the latest image" r=celiala a=rail This reverts commit 59b26a5. Running `preflight` against the `latest` tag sometimes results in a race condition. The registry refuses to update the existing image, and this behaviour is intermittent. Additionally, submitting the `latest` tag does not automatically publish it; we still need to publish it via the UI or API. Epic: none Release note: None 107959: roachtest: add row-level-ttl/during/tpcc test r=rafiss a=rafiss This tests the impact that row level TTL has on foreground TPCC traffic. 
Two modes are available: in one, no rows are deleted, but the TTL job still scans for expired rows; in the other, rows are scanned and deleted. fixes cockroachdb#107496 Release note: None Co-authored-by: Alex Barganier <[email protected]> Co-authored-by: Rail Aliiev <[email protected]> Co-authored-by: Rafi Shamim <[email protected]>
4 parents 509f8c8 + 4cca4de + 07e10df + 408ecd2 commit 70a5801

File tree

5 files changed

+207
-23
lines changed

5 files changed

+207
-23
lines changed

build/teamcity/internal/cockroach/release/publish/publish-redhat-release.sh

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ fi
6161
tc_end_block "Tag docker images as latest"
6262

6363
tc_start_block "Run preflight"
64-
mkdir -p artifacts/preflight
64+
mkdir -p artifacts
6565
docker run \
6666
--rm \
6767
--security-opt=label=disable \
@@ -72,30 +72,9 @@ docker run \
7272
--env PFLT_PYXIS_API_TOKEN="$REDHAT_API_TOKEN" \
7373
--env PFLT_DOCKERCONFIG=/temp-authfile.json \
7474
--env DOCKER_CONFIG=/tmp/docker \
75-
-v "$PWD/artifacts/preflight:/artifacts" \
75+
-v "$PWD/artifacts:/artifacts" \
7676
-v ~/.docker/config.json:/temp-authfile.json:ro \
7777
-v ~/.docker/config.json:/tmp/docker/config.json:ro \
7878
quay.io/opdev/preflight:stable check container \
7979
"${rhel_repository}:${version}" --submit
8080
tc_end_block "Run preflight"
81-
82-
if [[ -n "${PUBLISH_LATEST}" ]]; then
83-
tc_start_block "Run preflight on latest"
84-
mkdir -p artifacts/preflight-latest
85-
docker run \
86-
--rm \
87-
--security-opt=label=disable \
88-
--env PFLT_LOGLEVEL=trace \
89-
--env PFLT_ARTIFACTS=/artifacts \
90-
--env PFLT_LOGFILE=/artifacts/preflight.log \
91-
--env PFLT_CERTIFICATION_PROJECT_ID="$rhel_project_id" \
92-
--env PFLT_PYXIS_API_TOKEN="$REDHAT_API_TOKEN" \
93-
--env PFLT_DOCKERCONFIG=/temp-authfile.json \
94-
--env DOCKER_CONFIG=/tmp/docker \
95-
-v "$PWD/artifacts/preflight-latest:/artifacts" \
96-
-v ~/.docker/config.json:/temp-authfile.json:ro \
97-
-v ~/.docker/config.json:/tmp/docker/config.json:ro \
98-
quay.io/opdev/preflight:stable check container \
99-
"${rhel_repository}:latest" --submit
100-
tc_end_block "Run preflight on latest"
101-
fi

pkg/cmd/roachtest/tests/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ go_library(
109109
"multitenant_utils.go",
110110
"mvcc_gc.go",
111111
"network.go",
112+
"network_logging.go",
112113
"nodejs_postgres.go",
113114
"npgsql.go",
114115
"npgsql_blocklist.go",
@@ -137,6 +138,7 @@ go_library(
137138
"restore.go",
138139
"roachmart.go",
139140
"roachtest.go",
141+
"row_level_ttl.go",
140142
"ruby_pg.go",
141143
"ruby_pg_blocklist.go",
142144
"rust_postgres.go",
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// Copyright 2023 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the Business Source License
4+
// included in the file licenses/BSL.txt.
5+
//
6+
// As of the Change Date specified in that file, in accordance with
7+
// the Business Source License, use of this software will be governed
8+
// by the Apache License, Version 2.0, included in the file
9+
// licenses/APL.txt.
10+
11+
package tests
12+
13+
import (
14+
"context"
15+
"fmt"
16+
"strings"
17+
18+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
19+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
20+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
21+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
22+
"github.com/cockroachdb/cockroach/pkg/roachprod"
23+
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
24+
"github.com/stretchr/testify/require"
25+
)
26+
27+
const (
28+
// 3 node CRDB cluster, plus 1 node for workload
29+
numNodesNetworkLogging = 4
30+
fluentBitTCPPort = 5170
31+
// YAML template string defining a FluentBit and HTTP log sink, both with buffering enabled.
32+
// Container ports for both sinks are left to be interpolated (%d).
33+
logConfigTemplate = "{ http-defaults: { format: json-fluent, buffering: { max-staleness: 5s, flush-trigger-size: 1.0MiB, max-buffer-size: 50MiB } }, fluent-defaults: { format: json-fluent, buffering: { max-staleness: 5s, flush-trigger-size: 1.0MiB, max-buffer-size: 50MiB } }, sinks: { fluent-servers: { test-output: { channels: {INFO: all}, net: tcp, address: localhost:%d, filter: INFO, redact: false } }, http-servers: { test-output: { channels: {INFO: all}, address: http://localhost:%d, filter: INFO, method: POST, unsafe-tls: true } } } }"
34+
)
35+
36+
func registerNetworkLogging(r registry.Registry) {
37+
runNetworkLogging := func(
38+
ctx context.Context,
39+
t test.Test,
40+
c cluster.Cluster,
41+
) {
42+
crdbNodes := c.Range(1, c.Spec().NodeCount-1)
43+
workloadNode := c.Node(c.Spec().NodeCount)
44+
45+
// Install Docker, which we'll use for FluentBit.
46+
t.Status("installing docker")
47+
if err := c.Install(ctx, t.L(), crdbNodes, "docker"); err != nil {
48+
t.Fatalf("failed to install docker: %v", err)
49+
}
50+
51+
t.Status("installing FluentBit containers on CRDB nodes")
52+
// Create FluentBit container on the node with a TCP input and dev/null output.
53+
err := c.RunE(ctx, crdbNodes, fmt.Sprintf(
54+
"sudo docker run -d -p %d:%d --name=fluentbit fluent/fluent-bit -i tcp -o null",
55+
fluentBitTCPPort,
56+
fluentBitTCPPort))
57+
if err != nil {
58+
t.Fatalf("failed to install FluentBit containers: %v", err)
59+
}
60+
61+
// Install Cockroach, including on the workload node,
62+
// since we'll use ./cockroach workload.
63+
t.Status("installing cockroach")
64+
c.Put(ctx, t.Cockroach(), "./cockroach", c.All())
65+
66+
// Start each node with a log config containing fluent-server and http-server sinks.
67+
t.Status("starting cockroach on nodes")
68+
startOpts := option.DefaultStartOpts()
69+
logCfg := fmt.Sprintf(logConfigTemplate, fluentBitTCPPort, fluentBitTCPPort)
70+
startOpts.RoachprodOpts.ExtraArgs = []string{
71+
"--log", logCfg,
72+
}
73+
c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(install.SecureOption(true)), crdbNodes)
74+
75+
// Construct pgurls for the workload runner. As a roundabout way of detecting deadlocks,
76+
// we set a client timeout on the workload pgclient. If the server becomes unavailable
77+
// due to a deadlock, the timeout will eventually trigger and cause the test to fail.
78+
// We've had network logging bugs in the past that deadlocked without the nodes dying,
79+
// so this helps detect such a case.
80+
secureUrls, err := roachprod.PgURL(ctx,
81+
t.L(),
82+
c.MakeNodes(crdbNodes),
83+
"certs", /* certsDir */
84+
roachprod.PGURLOptions{
85+
External: false,
86+
Secure: true})
87+
require.NoError(t, err)
88+
workloadPGURLs := make([]string, len(secureUrls))
89+
for i, url := range secureUrls {
90+
// URLs are already wrapped in '', but we need to add a timeout flag.
91+
// Trim the trailing ' and re-add with the flag.
92+
trimmed := strings.TrimSuffix(url, "'")
93+
workloadPGURLs[i] = fmt.Sprintf("%s&statement_timeout=10s'", trimmed)
94+
}
95+
96+
// Init & run a workload on the workload node.
97+
t.Status("initializing workload")
98+
initWorkloadCmd := fmt.Sprintf("./cockroach workload init kv %s ", secureUrls[0])
99+
c.Run(ctx, workloadNode, initWorkloadCmd)
100+
101+
t.Status("running workload")
102+
m := c.NewMonitor(ctx, crdbNodes)
103+
m.Go(func(ctx context.Context) error {
104+
joinedURLs := strings.Join(workloadPGURLs, " ")
105+
runWorkloadCmd := fmt.Sprintf("./cockroach workload run kv --concurrency=32 --duration=1h %s", joinedURLs)
106+
return c.RunE(ctx, workloadNode, runWorkloadCmd)
107+
})
108+
m.Wait()
109+
}
110+
111+
r.Add(registry.TestSpec{
112+
Name: "network_logging",
113+
Owner: registry.OwnerObsInf,
114+
Cluster: r.MakeClusterSpec(numNodesNetworkLogging),
115+
Leases: registry.MetamorphicLeases,
116+
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
117+
runNetworkLogging(ctx, t, c)
118+
},
119+
})
120+
}

pkg/cmd/roachtest/tests/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ func RegisterTests(r registry.Registry) {
9494
registerMultiTenantUpgrade(r)
9595
registerMultiTenantSharedProcess(r)
9696
registerNetwork(r)
97+
registerNetworkLogging(r)
9798
registerNodeJSPostgres(r)
9899
registerNpgsql(r)
99100
registerPebbleWriteThroughput(r)
@@ -114,6 +115,7 @@ func RegisterTests(r registry.Registry) {
114115
registerRestoreNodeShutdown(r)
115116
registerRoachmart(r)
116117
registerRoachtest(r)
118+
registerRowLevelTTLDuringTPCC(r)
117119
registerRubyPG(r)
118120
registerRustPostgres(r)
119121
registerSQLAlchemy(r)
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
// Copyright 2023 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the Business Source License
4+
// included in the file licenses/BSL.txt.
5+
//
6+
// As of the Change Date specified in that file, in accordance with
7+
// the Business Source License, use of this software will be governed
8+
// by the Apache License, Version 2.0, included in the file
9+
// licenses/APL.txt.
10+
11+
package tests
12+
13+
import (
14+
"context"
15+
"fmt"
16+
"time"
17+
18+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
19+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
20+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
21+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
22+
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
23+
)
24+
25+
func registerRowLevelTTLDuringTPCC(r registry.Registry) {
26+
const nodes = 7
27+
var clusterSpec = spec.CPU(4)
28+
const warehouses = 1500
29+
const activeWarehouses = 100
30+
const duration = 30 * time.Minute
31+
r.Add(makeRowLevelTTLDuringTPCC(r.MakeClusterSpec(nodes, clusterSpec), warehouses, activeWarehouses, duration, false /* expiredRows */))
32+
r.Add(makeRowLevelTTLDuringTPCC(r.MakeClusterSpec(nodes, clusterSpec), warehouses, activeWarehouses, duration, true /* expiredRows */))
33+
}
34+
35+
func makeRowLevelTTLDuringTPCC(
36+
spec spec.ClusterSpec, warehouses, activeWarehouses int, length time.Duration, expiredRows bool,
37+
) registry.TestSpec {
38+
return registry.TestSpec{
39+
Name: fmt.Sprintf("row-level-ttl/during/tpcc/expired-rows=%t", expiredRows),
40+
Owner: registry.OwnerSQLFoundations,
41+
Benchmark: true,
42+
Cluster: spec,
43+
Leases: registry.MetamorphicLeases,
44+
Timeout: length * 3,
45+
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
46+
runTPCC(ctx, t, c, tpccOptions{
47+
Warehouses: warehouses,
48+
// We limit the number of workers because the default results in a lot
49+
// of connections which can lead to OOM issues (see #40566).
50+
ExtraRunArgs: fmt.Sprintf("--wait=false --tolerate-errors --max-rate=100 --active-warehouses=%d --workers=%d", activeWarehouses, warehouses),
51+
// The expired-rows test will delete rows from the order_line table, so
52+
// the post run checks are expected to fail.
53+
SkipPostRunCheck: expiredRows,
54+
During: func(ctx context.Context) error {
55+
nowMinute := timeutil.Now().Minute()
56+
scheduledMinute := (nowMinute + 10) % 60
57+
var expirationExpr string
58+
if expiredRows {
59+
expirationExpr = `'((ol_delivery_d::TIMESTAMP) + INTERVAL ''1 days'') AT TIME ZONE ''UTC'''`
60+
} else {
61+
// The TPCC fixtures have dates from 2006 for the ol_delivery_d column.
62+
expirationExpr = `'((ol_delivery_d::TIMESTAMP) + INTERVAL ''1000 years'') AT TIME ZONE ''UTC'''`
63+
}
64+
ttlStatement := fmt.Sprintf(`
65+
ALTER TABLE tpcc.public.order_line SET (
66+
ttl_expiration_expression=%s,
67+
ttl_job_cron='%d * * * *'
68+
);`, expirationExpr, scheduledMinute,
69+
)
70+
71+
if err := runAndLogStmts(ctx, t, c, "enable-ttl", []string{ttlStatement}); err != nil {
72+
return err
73+
}
74+
return nil
75+
},
76+
Duration: length,
77+
SetupType: usingImport,
78+
})
79+
},
80+
}
81+
}

0 commit comments

Comments
 (0)