Skip to content

Commit 47672d2

Browse files
craig[bot]andyyang890
andcommitted
Merge #147802
147802: roachtest: fix race in cdc/bank test r=asg0451 a=andyyang890 This patch fixes an issue where the `cdc/bank` test could unexpectedly fatal if the kafka chaos loop was in the middle of a stop/restart after one of the test's other goroutines completed. This was happening because the other goroutine was canceling the monitor's context, which is shared by all goroutines, instead of just signaling the goroutine running the bank workload to stop. The test has been modified so that each goroutine that needs to be canceled has its own cancel function and the kafka chaos loop no longer immediately fatals on context cancellation. Fixes #147742 Fixes #147820 Release note: None Co-authored-by: Andy Yang <[email protected]>
2 parents de6ca87 + 245f611 commit 47672d2

File tree

1 file changed

+56
-24
lines changed

1 file changed

+56
-24
lines changed

pkg/cmd/roachtest/tests/cdc.go

Lines changed: 56 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ import (
3535
"strconv"
3636
"strings"
3737
"sync"
38-
"sync/atomic"
3938
"time"
4039

4140
"github.com/IBM/sarama"
@@ -876,7 +875,7 @@ func runCDCBank(ctx context.Context, t test.Test, c cluster.Cluster, cfg cdcBank
876875
`SET CLUSTER SETTING changefeed.span_checkpoint.lag_threshold = '1us'`,
877876
} {
878877
if _, err := db.Exec(stmt); err != nil {
879-
t.Fatal(err)
878+
t.Fatal(fmt.Sprintf("failed to execute stmt %q: %v", stmt, err))
880879
}
881880
}
882881
}
@@ -910,33 +909,35 @@ func runCDCBank(ctx context.Context, t test.Test, c cluster.Cluster, cfg cdcBank
910909
defer l.Close()
911910

912911
t.Status("running workload")
913-
workloadCtx, workloadCancel := context.WithCancel(ctx)
914-
defer workloadCancel()
915912

916-
m := c.NewMonitor(workloadCtx, crdbNodes)
917-
var doneAtomic int64
918913
messageBuf := make(chan *sarama.ConsumerMessage, 4096)
919914
const requestedResolved = 100
920-
if cfg.kafkaChaos {
921-
m.Go(func(ctx context.Context) error {
915+
916+
m := c.NewMonitor(ctx, crdbNodes)
917+
chaosCancel := func() func() {
918+
if !cfg.kafkaChaos {
919+
return func() {}
920+
}
921+
return m.GoWithCancel(func(ctx context.Context) error {
922922
period, downTime := time.Minute, 10*time.Second
923923
err := kafka.chaosLoop(ctx, period, downTime, nil)
924-
if atomic.LoadInt64(&doneAtomic) > 0 {
924+
if errors.Is(err, context.Canceled) {
925925
return nil
926926
}
927927
return errors.Wrap(err, "kafka chaos loop failed")
928928
})
929-
}
930-
m.Go(func(ctx context.Context) error {
929+
}()
930+
workloadCancel := m.GoWithCancel(func(ctx context.Context) error {
931931
err := c.RunE(ctx, option.WithNodes(workloadNode), `./cockroach workload run bank {pgurl:1} --max-rate=10`)
932-
if atomic.LoadInt64(&doneAtomic) > 0 {
932+
if errors.Is(err, context.Canceled) {
933933
return nil
934934
}
935935
return errors.Wrap(err, "workload failed")
936936
})
937937
m.Go(func(ctx context.Context) error {
938+
defer chaosCancel()
938939
defer workloadCancel()
939-
defer func() { close(messageBuf) }()
940+
defer close(messageBuf)
940941
v := cdctest.NewCountValidator(cdctest.NoOpValidator)
941942
for {
942943
m, err := tc.next(ctx)
@@ -961,7 +962,6 @@ func runCDCBank(ctx context.Context, t test.Test, c cluster.Cluster, cfg cdcBank
961962
l.Printf("%d of %d resolved timestamps received from kafka, latest is %s behind realtime, %s beind realtime when sent to kafka",
962963
v.NumResolvedWithRows, requestedResolved, timeutil.Since(resolved.GoTime()), m.Timestamp.Sub(resolved.GoTime()))
963964
if v.NumResolvedWithRows >= requestedResolved {
964-
atomic.StoreInt64(&doneAtomic, 1)
965965
break
966966
}
967967
}
@@ -3471,9 +3471,17 @@ var kafkaServices = map[string][]string{
34713471
}
34723472

34733473
func (k kafkaManager) restart(ctx context.Context, targetService string, envVars ...string) {
3474+
if err := k.restartE(ctx, targetService, envVars...); err != nil {
3475+
k.t.Fatal(err)
3476+
}
3477+
}
3478+
3479+
func (k kafkaManager) restartE(ctx context.Context, targetService string, envVars ...string) error {
34743480
services := kafkaServices[targetService]
34753481

3476-
k.c.Run(ctx, option.WithNodes(k.kafkaSinkNodes), "touch", k.serverJAASConfig())
3482+
if err := k.c.RunE(ctx, option.WithNodes(k.kafkaSinkNodes), "touch", k.serverJAASConfig()); err != nil {
3483+
return err
3484+
}
34773485
for _, svcName := range services {
34783486
// The confluent tool applies the KAFKA_OPTS to all
34793487
// services. Also, the kafka.logs.dir is used by each
@@ -3492,12 +3500,21 @@ func (k kafkaManager) restart(ctx context.Context, targetService string, envVars
34923500
startCmd += fmt.Sprintf(" %s local services %s start", k.confluentBin(), svcName)
34933501

34943502
// Sometimes kafka wants to be difficult and not start back up first try. Give it some time.
3495-
k.c.Run(ctx, option.WithNodes(k.kafkaSinkNodes).WithRetryOpts(retry.Options{
3496-
InitialBackoff: 5 * time.Second,
3497-
MaxBackoff: 30 * time.Second,
3498-
MaxRetries: 30,
3499-
}).WithShouldRetryFn(func(*install.RunResultDetails) bool { return true }), startCmd)
3503+
if err := k.c.RunE(ctx, option.
3504+
WithNodes(k.kafkaSinkNodes).
3505+
WithRetryOpts(retry.Options{
3506+
InitialBackoff: 5 * time.Second,
3507+
MaxBackoff: 30 * time.Second,
3508+
MaxRetries: 30,
3509+
}).
3510+
WithShouldRetryFn(func(*install.RunResultDetails) bool { return true }),
3511+
startCmd,
3512+
); err != nil {
3513+
return err
3514+
}
35003515
}
3516+
3517+
return nil
35013518
}
35023519

35033520
func (k kafkaManager) makeCommand(exe string, args ...string) string {
@@ -3509,8 +3526,19 @@ func (k kafkaManager) makeCommand(exe string, args ...string) string {
35093526
}
35103527

35113528
func (k kafkaManager) stop(ctx context.Context) {
3512-
k.c.Run(ctx, option.WithNodes(k.kafkaSinkNodes), fmt.Sprintf("rm -f %s", k.serverJAASConfig()))
3513-
k.c.Run(ctx, option.WithNodes(k.kafkaSinkNodes), k.makeCommand("confluent", "local services stop"))
3529+
if err := k.stopE(ctx); err != nil {
3530+
k.t.Fatal(err)
3531+
}
3532+
}
3533+
3534+
func (k kafkaManager) stopE(ctx context.Context) error {
3535+
if err := k.c.RunE(ctx, option.WithNodes(k.kafkaSinkNodes), fmt.Sprintf("rm -f %s", k.serverJAASConfig())); err != nil {
3536+
return err
3537+
}
3538+
if err := k.c.RunE(ctx, option.WithNodes(k.kafkaSinkNodes), k.makeCommand("confluent", "local services stop")); err != nil {
3539+
return err
3540+
}
3541+
return nil
35143542
}
35153543

35163544
func (k kafkaManager) chaosLoop(
@@ -3528,7 +3556,9 @@ func (k kafkaManager) chaosLoop(
35283556
}
35293557

35303558
k.t.L().Printf("kafka chaos loop iteration %d: stopping", i)
3531-
k.stop(ctx)
3559+
if err := k.stopE(ctx); err != nil {
3560+
return err
3561+
}
35323562

35333563
select {
35343564
case <-stopper:
@@ -3539,7 +3569,9 @@ func (k kafkaManager) chaosLoop(
35393569
}
35403570

35413571
k.t.L().Printf("kafka chaos loop iteration %d: restarting", i)
3542-
k.restart(ctx, "kafka")
3572+
if err := k.restartE(ctx, "kafka"); err != nil {
3573+
return err
3574+
}
35433575
}
35443576
}
35453577

0 commit comments

Comments
 (0)