Skip to content

Commit 2d01e6d

Browse files
committed
failure-injection: fix cgroup disk stall for logs
The previous implementation of stalling logs had a comment about needing to temporarily move logs while a symlink was created. This comment changes the disk stall failure mode to actually do so.
1 parent dda5160 commit 2d01e6d

File tree

2 files changed

+124
-15
lines changed

2 files changed

+124
-15
lines changed

pkg/cmd/roachtest/tests/failure_injection.go

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,71 @@ var cgroupsDiskStallTests = func(c cluster.Cluster) []failureSmokeTest {
557557
return tests
558558
}
559559

560+
var cgroupStallLogsTest = func(c cluster.Cluster) failureSmokeTest {
561+
nodes := c.CRDBNodes()
562+
rand.Shuffle(len(nodes), func(i, j int) {
563+
nodes[i], nodes[j] = nodes[j], nodes[i]
564+
})
565+
// We only want to stall one node for this test. If we stall writes on a quorum
566+
// of nodes, then auth-session login will fail even if logs are not stalled.
567+
stalledNode := c.Node(nodes[0])
568+
unaffectedNode := c.Node(nodes[1])
569+
570+
getSessionCookie := func(
571+
ctx context.Context,
572+
l *logger.Logger,
573+
c cluster.Cluster,
574+
node option.NodeListOption,
575+
) bool {
576+
loginCmd := fmt.Sprintf(
577+
"%s auth-session login root --url={pgurl%s} --certs-dir ./certs --only-cookie",
578+
test.DefaultCockroachPath, node,
579+
)
580+
err := c.RunE(ctx, option.WithNodes(node), loginCmd)
581+
return err == nil
582+
}
583+
584+
return failureSmokeTest{
585+
testName: fmt.Sprintf("%s/WritesStalled=true/LogsStalled=true", failures.CgroupsDiskStallName),
586+
failureName: failures.CgroupsDiskStallName,
587+
args: failures.DiskStallArgs{
588+
StallWrites: true,
589+
RestartNodes: true,
590+
Nodes: stalledNode.InstallNodes(),
591+
StallLogs: true,
592+
},
593+
validateFailure: func(ctx context.Context, l *logger.Logger, c cluster.Cluster, f *failures.Failer) error {
594+
// Confirm symlink exists
595+
if err := c.RunE(ctx, option.WithNodes(stalledNode), "test -L logs"); err != nil {
596+
return errors.Wrapf(err, "`logs` is not a symlink on node %d", stalledNode)
597+
}
598+
599+
// The cockroach-sql-auth.log file is appended to each time an authenticated session event
600+
// occurs, e.g. a client logging in. If we attempt to fetch a cookie from the stalled node,
601+
// we should expect to see our request time out, as it will be unable to write to the log.
602+
if getSessionCookie(ctx, l, c, stalledNode) {
603+
return errors.Errorf("was able to successfully get session cookie from stalled node %d", stalledNode)
604+
}
605+
606+
// The unaffected node should be able to write to the log, so we should be able to
607+
// get the session cookie with no issues.
608+
if !getSessionCookie(ctx, l, c, unaffectedNode) {
609+
return errors.Errorf("was unable to get session cookie from unaffected node %d", unaffectedNode)
610+
}
611+
return nil
612+
},
613+
validateRecover: func(ctx context.Context, l *logger.Logger, c cluster.Cluster, f *failures.Failer) error {
614+
if !getSessionCookie(ctx, l, c, stalledNode) {
615+
return errors.Errorf("was unable to get session cookie from stalled node %d", stalledNode)
616+
}
617+
return nil
618+
},
619+
workload: func(ctx context.Context, c cluster.Cluster, args ...string) error {
620+
return nil
621+
},
622+
}
623+
}
624+
560625
var dmsetupDiskStallTest = func(c cluster.Cluster) failureSmokeTest {
561626
rng, _ := randutil.NewPseudoRand()
562627
// SeededRandGroups only returns an error if the requested size is larger than the
@@ -776,9 +841,7 @@ func defaultFailureSmokeTestWorkload(ctx context.Context, c cluster.Cluster, arg
776841
return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), cmd)
777842
}
778843

779-
func setupFailureSmokeTests(
780-
ctx context.Context, t test.Test, c cluster.Cluster, fr *failures.FailureRegistry,
781-
) error {
844+
func setupFailureSmokeTests(ctx context.Context, t test.Test, c cluster.Cluster) error {
782845
// Download any dependencies needed.
783846
if err := c.Install(ctx, t.L(), c.CRDBNodes(), "nmap"); err != nil {
784847
return err
@@ -804,7 +867,7 @@ func setupFailureSmokeTests(
804867
func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, noopFailer bool) {
805868
fr := failures.NewFailureRegistry()
806869
fr.Register()
807-
if err := setupFailureSmokeTests(ctx, t, c, fr); err != nil {
870+
if err := setupFailureSmokeTests(ctx, t, c); err != nil {
808871
t.Error(err)
809872
}
810873

@@ -815,6 +878,7 @@ func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, no
815878
latencyTest(c),
816879
dmsetupDiskStallTest(c),
817880
resetVMTests(c),
881+
cgroupStallLogsTest(c),
818882
}
819883
failureSmokeTests = append(failureSmokeTests, cgroupsDiskStallTests(c)...)
820884
failureSmokeTests = append(failureSmokeTests, processKillTests(c)...)

pkg/roachprod/failureinjection/failures/disk_stall.go

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"github.com/cockroachdb/cockroach/pkg/roachprod"
1818
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
1919
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
20+
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
2021
"github.com/cockroachdb/errors"
2122
)
2223

@@ -63,27 +64,58 @@ func (s *CGroupDiskStaller) Description() string {
6364
func (s *CGroupDiskStaller) Setup(ctx context.Context, l *logger.Logger, args FailureArgs) error {
6465
diskStallArgs := args.(DiskStallArgs)
6566

66-
// To stall logs we need to create a symlink that points to our stalled
67-
// store directory. In order to do that we need to temporarily move the
68-
// existing logs directory and copy the contents over after. If a symlink
69-
// already exists, don't attempt to recreate it.
67+
// Cgroup throttles a specific disk device, however our logs directory
68+
// is usually mounted on a different device than our cockroach data. To
69+
// stall both logs and the cockroach process, they must both be mounted
70+
// on the same device. To do so, we create a new logs directory in our
71+
// stalled device, e.g. {store-dir}/logs, and create a symlink from logs
72+
// to that directory.
73+
//
74+
// If the cluster is already running, we want to make sure we don't lose
75+
// any existing logs. We first move our existing logs to a temporary
76+
// directory, before copying them into the new symlinked directory.
7077
if diskStallArgs.StallLogs {
71-
createSymlinkCmd := `
78+
// N.B. Because multiple FS operations aren't atomic, we must temporarily
79+
// stop the cluster before moving the logs directory.
80+
if diskStallArgs.RestartNodes {
81+
if err := s.StopCluster(ctx, l, roachprod.DefaultStopOpts()); err != nil {
82+
return err
83+
}
84+
}
85+
86+
tmpLogsDir := fmt.Sprintf("tmp-disk-stall-%d", timeutil.Now().Unix())
87+
createSymlinkCmd := fmt.Sprintf(`
7288
if [ ! -L logs ]; then
73-
echo "creating symlink";
74-
mkdir -p {store-dir}/logs;
75-
ln -s {store-dir}/logs logs;
89+
if [ -e logs ]; then
90+
echo "moving existing logs to tmp directory %[1]s"
91+
mv logs %[1]s
92+
fi
93+
mkdir -p {store-dir}/logs
94+
echo "creating symlink logs -> {store-dir}/logs";
95+
ln -s {store-dir}/logs logs
96+
if [ -e %[1]s ]; then
97+
echo "copying tmp directory %[1]s to logs";
98+
cp -va %[1]s/* logs/
99+
fi
100+
else
101+
echo "symlink already exists, not creating";
76102
fi
77-
`
103+
`, tmpLogsDir)
78104
if err := s.Run(ctx, l, diskStallArgs.Nodes, createSymlinkCmd); err != nil {
79105
return err
80106
}
107+
if diskStallArgs.RestartNodes {
108+
if err := s.StartCluster(ctx, l); err != nil {
109+
return err
110+
}
111+
}
81112
}
82113
return nil
83114
}
84115
func (s *CGroupDiskStaller) Cleanup(ctx context.Context, l *logger.Logger, args FailureArgs) error {
116+
diskStallArgs := args.(DiskStallArgs)
85117
stallType := []bandwidthType{readBandwidth, writeBandwidth}
86-
nodes := args.(DiskStallArgs).Nodes
118+
nodes := diskStallArgs.Nodes
87119

88120
// Setting cgroup limits is idempotent so attempt to unlimit reads/writes in case
89121
// something went wrong in Recover.
@@ -92,9 +124,22 @@ func (s *CGroupDiskStaller) Cleanup(ctx context.Context, l *logger.Logger, args
92124
l.PrintfCtx(ctx, "error unstalling the disk; stumbling on: %v", err)
93125
}
94126
if args.(DiskStallArgs).StallLogs {
95-
if err = s.Run(ctx, l, nodes, "unlink logs/logs"); err != nil {
127+
// Cleanup our symlinked logs. Similar to Setup(), we must first stop the cluster
128+
// to stop the cockroach process from concurrently writing to the logs directory.
129+
if err = s.Run(ctx, l, nodes, "unlink logs"); err != nil {
96130
return err
97131
}
132+
if diskStallArgs.RestartNodes {
133+
if err = s.StopCluster(ctx, l, roachprod.DefaultStopOpts()); err != nil {
134+
return err
135+
}
136+
}
137+
if err = s.Run(ctx, l, nodes, "cp -r {store-dir}/logs logs"); err != nil {
138+
return err
139+
}
140+
if diskStallArgs.RestartNodes {
141+
return s.StartCluster(ctx, l)
142+
}
98143
}
99144
return nil
100145
}

0 commit comments

Comments
 (0)