Commit cf30717

Andrew Baptist committed
roachtest: modernize gracefuldraining test
Previously the test attempted to determine throughput from the metrics timeseries web API while the test was running. This caused high variance in the measurements even though the test runner itself reported lower variance. This change queries the `crdb_internal.node_metrics` table during the test instead, which produces more reliable results.

Epic: none
Informs: cockroachdb#106490
Release note: None
1 parent 955e7e0 commit cf30717
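To illustrate the new approach, here is a minimal sketch of how per-node QPS can be derived from `crdb_internal.node_metrics`: sample the cumulative `sql.query.count` counter on each connection, wait a fixed interval, sample again, and divide the delta by the elapsed seconds. The package name, helper names, and the single-row assumption for the node-level metric are illustrative only; the test itself relies on a `measureQPS` helper (called in the diff below), not this exact code.

package qpssketch

import (
	"context"
	gosql "database/sql"
	"time"
)

// sampleQueryCount reads the cumulative SQL query counter from each node's
// crdb_internal.node_metrics virtual table and returns the sum. The metric
// name mirrors what the old code read via /ts/query as
// cr.node.sql.query.count; treating it as a single node-level row is an
// assumption made for this sketch.
func sampleQueryCount(ctx context.Context, dbs ...*gosql.DB) (float64, error) {
	var total float64
	for _, db := range dbs {
		var v float64
		if err := db.QueryRowContext(ctx,
			`SELECT value FROM crdb_internal.node_metrics WHERE name = 'sql.query.count'`,
		).Scan(&v); err != nil {
			return 0, err
		}
		total += v
	}
	return total, nil
}

// measureQPSSketch returns the average queries per second observed across
// the given connections over the supplied interval.
func measureQPSSketch(
	ctx context.Context, interval time.Duration, dbs ...*gosql.DB,
) (float64, error) {
	before, err := sampleQueryCount(ctx, dbs...)
	if err != nil {
		return 0, err
	}
	select {
	case <-ctx.Done():
		return 0, ctx.Err()
	case <-time.After(interval):
	}
	after, err := sampleQueryCount(ctx, dbs...)
	if err != nil {
		return 0, err
	}
	return (after - before) / interval.Seconds(), nil
}

The idea is that reading the counters directly gives the test a measurement over exactly the window it chooses, rather than relying on the timeseries endpoint's sampled datapoints.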

File tree

1 file changed: +78 −151 lines

  • pkg/cmd/roachtest/tests

pkg/cmd/roachtest/tests/kv.go

Lines changed: 78 additions & 151 deletions
@@ -15,15 +15,13 @@ import (
 	gosql "database/sql"
 	"fmt"
 	"math/rand"
-	"net/http"
 	"os"
 	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
 
-	"github.com/cockroachdb/cockroach/pkg/base"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
@@ -32,9 +30,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
 	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
-	"github.com/cockroachdb/cockroach/pkg/ts/tspb"
-	"github.com/cockroachdb/cockroach/pkg/util/httputil"
-	"github.com/cockroachdb/cockroach/pkg/util/retry"
+	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/cockroachdb/errors"
 	"github.com/stretchr/testify/assert"
@@ -518,50 +514,52 @@ func registerKVGracefulDraining(r registry.Registry) {
 		Cluster: r.MakeClusterSpec(4),
 		Leases: registry.MetamorphicLeases,
 		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
+			c.Put(ctx, t.Cockroach(), "./cockroach", c.Range(1, c.Spec().NodeCount))
 			nodes := c.Spec().NodeCount - 1
-			c.Put(ctx, t.Cockroach(), "./cockroach", c.Range(1, nodes))
-			c.Put(ctx, t.DeprecatedWorkload(), "./workload", c.Node(nodes+1))
 
 			t.Status("starting cluster")
-
 			// If the test ever fails, the person who investigates the
 			// failure will likely be thankful for this additional logging.
 			startOpts := option.DefaultStartOpts()
 			startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs, "--vmodule=store=2,store_rebalancer=2")
 			c.Start(ctx, t.L(), startOpts, install.MakeClusterSettings(), c.Range(1, nodes))
 
-			db := c.Conn(ctx, t.L(), 1)
-			defer db.Close()
+			db1 := c.Conn(ctx, t.L(), 1)
+			defer db1.Close()
+			db2 := c.Conn(ctx, t.L(), 2)
+			defer db2.Close()
 
-			err := WaitFor3XReplication(ctx, t, db)
+			err := WaitFor3XReplication(ctx, t, db1)
 			require.NoError(t, err)
 
 			t.Status("initializing workload")
 
 			// Initialize the database with a lot of ranges so that there are
 			// definitely a large number of leases on the node that we shut down
 			// before it starts draining.
-			splitCmd := "./workload run kv --init --max-ops=1 --splits 100 {pgurl:1}"
-			c.Run(ctx, c.Node(nodes+1), splitCmd)
+			c.Run(ctx, c.Node(1), "./cockroach workload init kv --splits 100")
 
 			m := c.NewMonitor(ctx, c.Nodes(1, nodes))
+			m.ExpectDeath()
 
 			// specifiedQPS is going to be the --max-rate for the kv workload.
-			const specifiedQPS = 1000
+			specifiedQPS := 1000
+			if c.IsLocal() {
+				specifiedQPS = 100
+			}
 			// Because we're specifying a --max-rate well less than what cockroach
 			// should be capable of, draining one of the three nodes should have no
 			// effect on performance at all, meaning that a fairly aggressive
 			// threshold here should be ok.
-			expectedQPS := specifiedQPS * 0.9
+			expectedQPS := float64(specifiedQPS) * .9
 
 			t.Status("starting workload")
 			workloadStartTime := timeutil.Now()
 			desiredRunDuration := 5 * time.Minute
 			m.Go(func(ctx context.Context) error {
 				cmd := fmt.Sprintf(
-					"./workload run kv --duration=%s --read-percent=0 --tolerate-errors --max-rate=%d {pgurl:1-%d}",
-					desiredRunDuration,
-					specifiedQPS, nodes-1)
+					"./cockroach workload run kv --duration=%s --read-percent=0 --max-rate=%d {pgurl:1-%d}",
+					desiredRunDuration, specifiedQPS, nodes-1)
 				t.WorkerStatus(cmd)
 				defer func() {
 					t.WorkerStatus("workload command completed")
@@ -570,152 +568,81 @@ func registerKVGracefulDraining(r registry.Registry) {
 				return c.RunE(ctx, c.Node(nodes+1), cmd)
 			})
 
-			m.Go(func(ctx context.Context) error {
-				defer t.WorkerStatus()
-
-				t.WorkerStatus("waiting for perf to stabilize")
-				// Before we start shutting down nodes, wait for the performance
-				// of the workload to stabilize at the expected allowed level.
-
-				adminURLs, err := c.ExternalAdminUIAddr(ctx, t.L(), c.Node(1))
-				if err != nil {
-					return err
-				}
-				url := "http://" + adminURLs[0] + "/ts/query"
-				getQPSTimeSeries := func(start, end time.Time) ([]tspb.TimeSeriesDatapoint, error) {
-					request := tspb.TimeSeriesQueryRequest{
-						StartNanos: start.UnixNano(),
-						EndNanos: end.UnixNano(),
-						// Check the performance in each timeseries sample interval.
-						SampleNanos: base.DefaultMetricsSampleInterval.Nanoseconds(),
-						Queries: []tspb.Query{
-							{
-								Name: "cr.node.sql.query.count",
-								Downsampler: tspb.TimeSeriesQueryAggregator_AVG.Enum(),
-								SourceAggregator: tspb.TimeSeriesQueryAggregator_SUM.Enum(),
-								Derivative: tspb.TimeSeriesQueryDerivative_NON_NEGATIVE_DERIVATIVE.Enum(),
-							},
-						},
-					}
-					var response tspb.TimeSeriesQueryResponse
-					if err := httputil.PostJSON(http.Client{}, url, &request, &response); err != nil {
-						return nil, err
-					}
-					if len(response.Results[0].Datapoints) <= 1 {
-						return nil, errors.Newf("not enough datapoints in timeseries query response: %+v", response)
-					}
-					return response.Results[0].Datapoints, nil
+			verifyQPS := func(ctx context.Context) error {
+				if qps := measureQPS(ctx, t, time.Second, db1, db2); qps < expectedQPS {
+					return errors.Newf(
+						"QPS of %.2f at time %v is below minimum allowable QPS of %.2f",
+						qps, timeutil.Now(), expectedQPS)
 				}
+				return nil
+			}
 
-				waitBegin := timeutil.Now()
-				// Nb: we could want to use testutil.SucceedSoonError() here,
-				// however that has a hardcoded timeout of 45 seconds, and
-				// empirically we see this loop needs ~40 seconds to get enough
-				// samples to succeed. This would be too close to call, so
-				// we're using our own timeout instead.
-				if err := retry.ForDuration(1*time.Minute, func() (err error) {
-					defer func() {
-						if timeutil.Since(waitBegin) > 3*time.Second && err != nil {
-							t.Status(fmt.Sprintf("perf not stable yet: %v", err))
-						}
-					}()
-					now := timeutil.Now()
-					datapoints, err := getQPSTimeSeries(workloadStartTime, now)
-					if err != nil {
-						return err
-					}
+			t.Status("waiting for perf to stabilize")
+			testutils.SucceedsSoon(t, func() error { return verifyQPS(ctx) })
 
-					// Examine the last data point. As the retry.ForDuration loop
-					// iterates, this will only consider the last 10 seconds of
-					// measurement.
-					dp := datapoints[len(datapoints)-1]
-					if qps := dp.Value; qps < expectedQPS {
-						return errors.Newf(
-							"QPS of %.2f at time %v is below minimum allowable QPS of %.2f; entire timeseries: %+v",
-							qps, timeutil.Unix(0, dp.TimestampNanos), expectedQPS, datapoints)
+			// Begin the monitoring goroutine to track QPS every second.
+			m.Go(func(ctx context.Context) error {
+				t.Status("starting watcher to verify QPS during the test")
+				defer t.WorkerStatus()
+				for {
+					// Measure QPS every second throughout the test. verifyQPS takes time
+					// to run so we don't sleep between invocations.
+					require.NoError(t, verifyQPS(ctx))
+					// Stop measuring 10 seconds before we stop the workload.
+					if timeutil.Since(workloadStartTime) > desiredRunDuration-10*time.Second {
+						return nil
					}
-
-					// The desired performance has been reached by the
-					// workload. We're ready to start exercising shutdowns.
-					return nil
-				}); err != nil {
-					t.Fatal(err)
 				}
-				t.Status("detected stable perf before restarts: OK")
-
-				// The time at which we know the performance has become stable already.
-				stablePerfStartTime := timeutil.Now()
-
-				t.WorkerStatus("gracefully draining and restarting nodes")
-				// Gracefully shut down the third node, let the cluster run for a
-				// while, then restart it. Then repeat for good measure.
-				for i := 0; i < 2; i++ {
-					if i > 0 {
-						// No need to wait extra during the first iteration: we
-						// have already waited for the perf to become stable
-						// above.
-						t.Status("letting workload run with all nodes")
-						select {
-						case <-ctx.Done():
-							return nil
-						case <-time.After(1 * time.Minute):
-						}
-					}
-					m.ExpectDeath()
-					// Graceful drain: send SIGTERM, which should be sufficient
-					// to stop the node, followed by a non-graceful SIGKILL a
-					// bit later to clean up should the process have become
-					// stuck.
-					stopOpts := option.DefaultStopOpts()
-					stopOpts.RoachprodOpts.Sig = 15
-					stopOpts.RoachprodOpts.Wait = true
-					stopOpts.RoachprodOpts.MaxWait = 30
-					c.Stop(ctx, t.L(), stopOpts, c.Node(nodes))
-					c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Node(nodes))
-					t.Status("letting workload run with one node down")
+			})
+
+			t.Status("gracefully draining and restarting nodes")
+			// Gracefully shut down the third node, let the cluster run for a
+			// while, then restart it. Then repeat for good measure.
+			for i := 0; i < 2; i++ {
+				if i > 0 {
+					// No need to wait extra during the first iteration: we
+					// have already waited for the perf to become stable
+					// above.
+					t.Status("letting workload run with all nodes")
 					select {
 					case <-ctx.Done():
-						return nil
+						return
 					case <-time.After(1 * time.Minute):
 					}
-					c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.Node(nodes))
-					m.ResetDeaths()
 				}
-
-				// Let the test run for nearly the entire duration of the kv command.
-				// The key is that we want the workload command to still be running when
-				// we look at the performance below. Given that the workload was set
-				// to run for 5 minutes, we should be fine here, however we want to guarantee
-				// there's at least 10s left to go. Check this.
-				t.WorkerStatus("checking workload is still running")
-				runDuration := timeutil.Since(workloadStartTime)
-				if runDuration > desiredRunDuration-10*time.Second {
-					t.Fatalf("not enough workload time left to reliably determine performance (%s left)",
-						desiredRunDuration-runDuration)
-				}
-
-				t.WorkerStatus("checking for perf throughout the test")
-
-				// Check that the QPS has been at the expected max rate for the entire
-				// test duration, even as one of the nodes was being stopped and started.
-				endTestTime := timeutil.Now()
-				datapoints, err := getQPSTimeSeries(stablePerfStartTime, endTestTime)
-				if err != nil {
-					t.Fatal(err)
+				// Graceful drain: send SIGTERM, which should be sufficient
+				// to stop the node, followed by a non-graceful SIGKILL a
+				// bit later to clean up should the process have become
+				// stuck.
+				stopOpts := option.DefaultStopOpts()
+				stopOpts.RoachprodOpts.Sig = 15
+				stopOpts.RoachprodOpts.Wait = true
+				stopOpts.RoachprodOpts.MaxWait = 30
+				c.Stop(ctx, t.L(), stopOpts, c.Node(nodes))
+				c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Node(nodes))
+				t.Status("letting workload run with one node down")
+				select {
+				case <-ctx.Done():
+					return
+				case <-time.After(1 * time.Minute):
 				}
+				c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.Node(nodes))
+				m.ResetDeaths()
+			}
 
-				for _, dp := range datapoints {
-					if qps := dp.Value; qps < expectedQPS {
-						t.Fatalf(
-							"QPS of %.2f at time %v is below minimum allowable QPS of %.2f; entire timeseries: %+v",
-							qps, timeutil.Unix(0, dp.TimestampNanos), expectedQPS, datapoints)
-					}
-				}
-				t.Status("perf is OK!")
-				t.WorkerStatus("waiting for workload to complete")
-				return nil
-			})
+			// Let the test run for nearly the entire duration of the kv command.
+			// The key is that we want the workload command to still be running when
+			// we look at the performance below. Given that the workload was set
+			// to run for 5 minutes, we should be fine here, however we want to guarantee
+			// there's at least 10s left to go. Check this.
+			t.Status("checking workload is still running")
+			runDuration := timeutil.Since(workloadStartTime)
+			if runDuration > desiredRunDuration-10*time.Second {
+				t.Fatalf("not enough workload time left to reliably determine performance (%s left)",
+					desiredRunDuration-runDuration)
+			}
 
+			t.Status("waiting for workload to complete")
 			m.Wait()
 		},
 	})
