
Commit a6dbdee

craig[bot] and yuzefovich committed
106639: stmtdiagnostics: de-flake TestDiagnosticsRequest for good r=yuzefovich a=yuzefovich

**stmtdiagnostics: fix possible race when polling is disabled**

This commit fixes a possible race on `timeutil.Timer.C` that can happen when the polling interval is disabled. The race would occur because disabling the polling `Stop`s the timer, which puts it back into the pool and disallows further access to the `Timer` object. However, we had already stored a reference to `Timer.C`, so we would read from a channel that could be concurrently overwritten by another goroutine (one that happened to pick up this timer object from the pool). To solve this race we now store the channel separately and explicitly nil it out after stopping the timer (which makes the corresponding select case block forever, until the polling interval is changed).

Additionally, this commit adds non-negative validation to the cluster setting since there is no point in allowing negative durations for the polling interval.

Release note: None

**stmtdiagnostics: de-flake TestDiagnosticsRequest for good**

This commit - hopefully - de-flakes TestDiagnosticsRequest for good. We recently merged a change to remove the request from the local registry _before_ inserting the bundle. This introduced a possible race between the bundle being inserted (which also marks the request as "completed") and the registry's polling mechanism repopulating itself: if the polling happens before the request is marked as "completed", the request is reinserted into the registry, which then fails the test (the test later observes that the request is complete, yet the request is still in the registry).

To prevent flakes of this kind, the test now disables the registry's polling mechanism altogether - I don't think it is necessary since the test inserts requests directly into the registry (in addition to writing them into the system table). I believe this polling mechanism was the reason we had to use SucceedsSoon in one of the test cases, so disabling it allows us to address an old TODO. (I believe what rarely happened was that the request was canceled by the test case - which removes it from the registry - but then, before the query was executed, the polling took place and the request was reinserted into the registry.)

Fixes: cockroachdb#106582.

Release note: None

Co-authored-by: Yahor Yuzefovich <[email protected]>
2 parents 7b63c23 + 8973610 commit a6dbdee
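Background for the first fix above: a receive from a nil channel in Go blocks forever, so a select case can be switched off by nilling the stored channel variable. The following is a minimal, self-contained sketch of that idiom using the standard library's time.Timer rather than CockroachDB's pooled timeutil.Timer; the names pollLoop, updates, and done are invented for illustration, and the real code additionally has to avoid touching a pooled timer after Stop().

package main

import (
	"fmt"
	"time"
)

// pollLoop sketches the "nil channel disables a select case" idiom. A zero
// interval received on updates disables polling; a positive one re-enables it.
func pollLoop(updates <-chan time.Duration, done <-chan struct{}) {
	var (
		timer    = time.NewTimer(time.Hour)
		timerC   <-chan time.Time // nil => the timer case below blocks forever
		interval time.Duration
	)
	timer.Stop() // start disabled

	for {
		select {
		case interval = <-updates:
			if interval == 0 {
				// Disable polling: stop the timer and nil out the stored
				// channel so this case is effectively switched off.
				timer.Stop()
				timerC = nil
			} else {
				timer.Reset(interval)
				timerC = timer.C
			}
		case t := <-timerC:
			fmt.Println("poll at", t.Format("15:04:05.000"))
			timer.Reset(interval) // re-arm for the next poll
		case <-done:
			return
		}
	}
}

func main() {
	updates := make(chan time.Duration, 1)
	done := make(chan struct{})
	go pollLoop(updates, done)

	updates <- 50 * time.Millisecond // enable polling
	time.Sleep(175 * time.Millisecond)
	updates <- 0 // disable polling: the timer case now blocks until re-enabled
	time.Sleep(100 * time.Millisecond)
	close(done)
}

The real poll loop in statement_diagnostics.go adds the deadline and timer.Read bookkeeping that timeutil.Timer requires; this sketch only shows why nilling the stored channel makes the disabled case safe.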

2 files changed: +49 -54 lines changed

pkg/sql/stmtdiagnostics/statement_diagnostics.go

Lines changed: 15 additions & 5 deletions
@@ -36,7 +36,9 @@ var pollingInterval = settings.RegisterDurationSetting(
 	settings.TenantReadOnly,
 	"sql.stmt_diagnostics.poll_interval",
 	"rate at which the stmtdiagnostics.Registry polls for requests, set to zero to disable",
-	10*time.Second)
+	10*time.Second,
+	settings.NonNegativeDuration,
+)
 
 var bundleChunkSize = settings.RegisterByteSizeSetting(
 	settings.TenantWritable,
@@ -158,19 +160,27 @@ func (r *Registry) Start(ctx context.Context, stopper *stop.Stopper) {
 
 func (r *Registry) poll(ctx context.Context) {
 	var (
-		timer    timeutil.Timer
+		timer timeutil.Timer
+		// We need to store timer.C reference separately because timer.Stop()
+		// (called when polling is disabled) puts timer into the pool and
+		// prohibits further usage of stored timer.C.
+		timerC = timer.C
 		lastPoll time.Time
 		deadline time.Time
 		pollIntervalChanged = make(chan struct{}, 1)
 		maybeResetTimer = func() {
-			if interval := pollingInterval.Get(&r.st.SV); interval <= 0 {
-				// Setting the interval to a non-positive value stops the polling.
+			if interval := pollingInterval.Get(&r.st.SV); interval == 0 {
+				// Setting the interval to zero stops the polling.
 				timer.Stop()
+				// nil out the channel so that it'd block forever in the loop
+				// below (until the polling interval is changed).
+				timerC = nil
 			} else {
 				newDeadline := lastPoll.Add(interval)
 				if deadline.IsZero() || !deadline.Equal(newDeadline) {
 					deadline = newDeadline
 					timer.Reset(timeutil.Until(deadline))
+					timerC = timer.C
 				}
 			}
 		}
@@ -195,7 +205,7 @@ func (r *Registry) poll(ctx context.Context) {
 		select {
 		case <-pollIntervalChanged:
 			continue // go back around and maybe reset the timer
-		case <-timer.C:
+		case <-timerC:
 			timer.Read = true
 		case <-ctx.Done():
 			return
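
As a possible companion to the validation added above, the following hedged test sketch shows how the new behavior could be exercised end to end: zero stays a legal value that disables polling, while a negative duration should now be rejected when the cluster setting is written, given the settings.NonNegativeDuration option. The test name TestPollIntervalSettingValidation is invented and not part of this commit; the setup mirrors what TestDiagnosticsRequest already does (serverutils.StartServer plus a SQL connection), and it assumes SET CLUSTER SETTING surfaces the validation error.

package stmtdiagnostics_test

import (
	"context"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/stretchr/testify/require"
)

// TestPollIntervalSettingValidation is a hypothetical sketch, not part of the
// commit: it assumes the NonNegativeDuration validator rejects negative values
// at SET time.
func TestPollIntervalSettingValidation(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	ctx := context.Background()
	s, db, _ := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop(ctx)

	// Zero is allowed and turns the polling loop off entirely.
	_, err := db.Exec("SET CLUSTER SETTING sql.stmt_diagnostics.poll_interval = '0'")
	require.NoError(t, err)

	// A negative interval should fail validation instead of being silently
	// treated as "disabled".
	_, err = db.Exec("SET CLUSTER SETTING sql.stmt_diagnostics.poll_interval = '-1s'")
	require.Error(t, err)
}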

pkg/sql/stmtdiagnostics/statement_diagnostics_test.go

Lines changed: 34 additions & 49 deletions
@@ -50,6 +50,11 @@ func TestDiagnosticsRequest(t *testing.T) {
 	_, err := db.Exec("CREATE TABLE test (x int PRIMARY KEY)")
 	require.NoError(t, err)
 
+	// Disable polling interval since we're inserting requests directly into the
+	// registry manually and want precise control of updating the registry.
+	_, err = db.Exec("SET CLUSTER SETTING sql.stmt_diagnostics.poll_interval = '0';")
+	require.NoError(t, err)
+
 	var collectUntilExpirationEnabled bool
 	isCompleted := func(reqID int64) (completed bool, diagnosticsID gosql.NullInt64) {
 		completedQuery := "SELECT completed, statement_diagnostics_id FROM system.statement_diagnostics_requests WHERE ID = $1"
@@ -76,28 +81,12 @@ func TestDiagnosticsRequest(t *testing.T) {
 		require.True(t, completed)
 		require.True(t, diagnosticsID.Valid)
 	}
-	// checkMaybeCompleted returns an error if 'completed' value for the given
-	// request is different from expectedCompleted.
-	checkMaybeCompleted := func(reqID int64, expectedCompleted bool) error {
-		completed, diagnosticsID := isCompleted(reqID)
-		if completed != expectedCompleted {
-			return errors.Newf("expected completed to be %t, but found %t", expectedCompleted, completed)
-		}
-		// diagnosticsID is NULL when the request hasn't been completed yet.
-		require.True(t, diagnosticsID.Valid == expectedCompleted)
-		return nil
-	}
 	setCollectUntilExpiration := func(v bool) {
 		collectUntilExpirationEnabled = v
 		_, err := db.Exec(
 			fmt.Sprintf("SET CLUSTER SETTING sql.stmt_diagnostics.collect_continuously.enabled = %t", v))
 		require.NoError(t, err)
 	}
-	setPollInterval := func(d time.Duration) {
-		_, err := db.Exec(
-			fmt.Sprintf("SET CLUSTER SETTING sql.stmt_diagnostics.poll_interval = '%s'", d))
-		require.NoError(t, err)
-	}
 
 	var minExecutionLatency, expiresAfter time.Duration
 	var samplingProbability float64
@@ -259,26 +248,21 @@ func TestDiagnosticsRequest(t *testing.T) {
 			}
 			for _, expiresAfter := range []time.Duration{0, time.Second} {
 				t.Run(fmt.Sprintf("expiresAfter=%s", expiresAfter), func(t *testing.T) {
-					// TODO(yuzefovich): for some reason occasionally the
-					// bundle for the request is collected, so we use
-					// SucceedsSoon. Figure it out.
-					testutils.SucceedsSoon(t, func() error {
-						reqID, err := registry.InsertRequestInternal(
-							ctx, fprint, samplingProbability, minExecutionLatency, expiresAfter,
-						)
-						require.NoError(t, err)
-						checkNotCompleted(reqID)
-
-						err = registry.CancelRequest(ctx, reqID)
-						require.NoError(t, err)
-						checkNotCompleted(reqID)
-
-						// Run the query that is slow enough to satisfy the
-						// conditional request.
-						_, err = db.Exec("SELECT pg_sleep(0.2)")
-						require.NoError(t, err)
-						return checkMaybeCompleted(reqID, false /* expectedCompleted */)
-					})
+					reqID, err := registry.InsertRequestInternal(
+						ctx, fprint, samplingProbability, minExecutionLatency, expiresAfter,
+					)
+					require.NoError(t, err)
+					checkNotCompleted(reqID)
+
+					err = registry.CancelRequest(ctx, reqID)
+					require.NoError(t, err)
+					checkNotCompleted(reqID)
+
+					// Run the query that is slow enough to satisfy the
+					// conditional request.
+					_, err = db.Exec("SELECT pg_sleep(0.2)")
+					require.NoError(t, err)
+					checkNotCompleted(reqID)
 				})
 			}
 		})
@@ -324,7 +308,13 @@ func TestDiagnosticsRequest(t *testing.T) {
 				require.NoError(t, err)
 
 				wg.Wait()
-				return checkMaybeCompleted(reqID, true /* expectedCompleted */)
+
+				completed, diagnosticsID := isCompleted(reqID)
+				if !completed {
+					return errors.New("expected request to be completed")
+				}
+				require.True(t, diagnosticsID.Valid)
+				return nil
 			})
 		})
 	}
@@ -451,21 +441,16 @@ func TestDiagnosticsRequest(t *testing.T) {
 		// Sleep until expiration (and then some), and then run the query.
 		time.Sleep(expiresAfter + 100*time.Millisecond)
 
-		setPollInterval(10 * time.Millisecond)
-		defer setPollInterval(stmtdiagnostics.PollingInterval.Default())
-
-		// We should not find the request and a subsequent executions should not
-		// capture anything.
-		testutils.SucceedsSoon(t, func() error {
-			if found := registry.TestingFindRequest(reqID); found {
-				return errors.New("expected expired request to no longer be tracked")
-			}
-			return nil
-		})
-
+		// Even though the request has expired, it hasn't been removed from the
+		// registry yet (because we disabled the polling interval). When we run
+		// the query that matches the fingerprint, the expired request is
+		// removed, and the bundle is not collected.
 		_, err = db.Exec("SELECT pg_sleep(0.01)") // run the query
 		require.NoError(t, err)
 		checkNotCompleted(reqID)
+
+		// Sanity check that the request is no longer in the registry.
+		require.False(t, registry.TestingFindRequest(reqID))
 	})
 }
