
Commit a6dbdee

craig[bot] and yuzefovich committed
106639: stmtdiagnostics: de-flake TestDiagnosticsRequest for good r=yuzefovich a=yuzefovich

**stmtdiagnostics: fix possible race when polling is disabled**

This commit fixes a possible race on `timeutil.Timer.C` that can happen when the polling interval is disabled. The race would occur because disabling the polling `Stop`s the timer, which puts it back into the pool and disallows further access to the `Timer` object. However, we had already stored a reference to `Timer.C`, so we would read from a channel that could be concurrently overwritten by another goroutine (one that happened to pick up this timer object from the pool). To solve this race we now store the channel separately and explicitly nil it out after stopping the timer (which makes the corresponding select case block forever, until the polling interval is changed).

Additionally, this commit adds non-negative validation to the cluster setting since there is no point in allowing negative durations for the polling interval.

Release note: None

**stmtdiagnostics: de-flake TestDiagnosticsRequest for good**

This commit - hopefully - de-flakes TestDiagnosticsRequest for good. We recently merged a change to remove the request from the local registry _before_ inserting the bundle. This introduced a possible race between the bundle being inserted (which also marks the request as "completed") and the registry's polling mechanism repopulating itself: if the polling happens before the request is marked as "completed", the request is reinserted into the registry, which then fails the test (the test later observes that the request is complete, yet the request is still in the registry).

To prevent flakes of this kind, the test now disables the registry's polling mechanism altogether - I don't think it is necessary since the test inserts requests directly into the registry (in addition to writing them into the system table). I believe this polling mechanism was the reason we had to use SucceedsSoon in one of the test cases, so disabling it allows us to address an old TODO. (I believe what rarely happened was that the request was canceled by the test case - which removes it from the registry - but then, before the query was executed, the polling took place and the request was reinserted into the registry.)

Fixes: cockroachdb#106582.

Release note: None

Co-authored-by: Yahor Yuzefovich <[email protected]>
2 parents 7b63c23 + 8973610 commit a6dbdee
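Background for the first fix above: a receive from a nil channel in Go blocks forever, so a select case can be switched off by nilling the stored channel variable. The following is a minimal, self-contained sketch of that idiom using the standard library's time.Timer rather than CockroachDB's pooled timeutil.Timer; the names pollLoop, updates, and done are invented for illustration, and the real code additionally has to avoid touching a pooled timer after Stop().

package main

import (
	"fmt"
	"time"
)

// pollLoop sketches the "nil channel disables a select case" idiom. A zero
// interval received on updates disables polling; a positive one re-enables it.
func pollLoop(updates <-chan time.Duration, done <-chan struct{}) {
	var (
		timer    = time.NewTimer(time.Hour)
		timerC   <-chan time.Time // nil => the timer case below blocks forever
		interval time.Duration
	)
	timer.Stop() // start disabled

	for {
		select {
		case interval = <-updates:
			if interval == 0 {
				// Disable polling: stop the timer and nil out the stored
				// channel so this case is effectively switched off.
				timer.Stop()
				timerC = nil
			} else {
				timer.Reset(interval)
				timerC = timer.C
			}
		case t := <-timerC:
			fmt.Println("poll at", t.Format("15:04:05.000"))
			timer.Reset(interval) // re-arm for the next poll
		case <-done:
			return
		}
	}
}

func main() {
	updates := make(chan time.Duration, 1)
	done := make(chan struct{})
	go pollLoop(updates, done)

	updates <- 50 * time.Millisecond // enable polling
	time.Sleep(175 * time.Millisecond)
	updates <- 0 // disable polling: the timer case now blocks until re-enabled
	time.Sleep(100 * time.Millisecond)
	close(done)
}

The real poll loop in statement_diagnostics.go adds the deadline and timer.Read bookkeeping that timeutil.Timer requires; this sketch only shows why nilling the stored channel makes the disabled case safe.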

2 files changed: +49 -54 lines changed

pkg/sql/stmtdiagnostics/statement_diagnostics.go

Lines changed: 15 additions & 5 deletions
@@ -36,7 +36,9 @@ var pollingInterval = settings.RegisterDurationSetting(
 	settings.TenantReadOnly,
 	"sql.stmt_diagnostics.poll_interval",
 	"rate at which the stmtdiagnostics.Registry polls for requests, set to zero to disable",
-	10*time.Second)
+	10*time.Second,
+	settings.NonNegativeDuration,
+)
 
 var bundleChunkSize = settings.RegisterByteSizeSetting(
 	settings.TenantWritable,
@@ -158,19 +160,27 @@ func (r *Registry) Start(ctx context.Context, stopper *stop.Stopper) {
 
 func (r *Registry) poll(ctx context.Context) {
 	var (
-		timer    timeutil.Timer
+		timer timeutil.Timer
+		// We need to store timer.C reference separately because timer.Stop()
+		// (called when polling is disabled) puts timer into the pool and
+		// prohibits further usage of stored timer.C.
+		timerC = timer.C
 		lastPoll time.Time
 		deadline time.Time
 		pollIntervalChanged = make(chan struct{}, 1)
 		maybeResetTimer = func() {
-			if interval := pollingInterval.Get(&r.st.SV); interval <= 0 {
-				// Setting the interval to a non-positive value stops the polling.
+			if interval := pollingInterval.Get(&r.st.SV); interval == 0 {
+				// Setting the interval to zero stops the polling.
 				timer.Stop()
+				// nil out the channel so that it'd block forever in the loop
+				// below (until the polling interval is changed).
+				timerC = nil
 			} else {
 				newDeadline := lastPoll.Add(interval)
 				if deadline.IsZero() || !deadline.Equal(newDeadline) {
 					deadline = newDeadline
 					timer.Reset(timeutil.Until(deadline))
+					timerC = timer.C
 				}
 			}
 		}
@@ -195,7 +205,7 @@ func (r *Registry) poll(ctx context.Context) {
 		select {
 		case <-pollIntervalChanged:
 			continue // go back around and maybe reset the timer
-		case <-timer.C:
+		case <-timerC:
 			timer.Read = true
 		case <-ctx.Done():
 			return
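
As a possible companion to the validation added above, the following hedged test sketch shows how the new behavior could be exercised end to end: zero stays a legal value that disables polling, while a negative duration should now be rejected when the cluster setting is written, given the settings.NonNegativeDuration option. The test name TestPollIntervalSettingValidation is invented and not part of this commit; the setup mirrors what TestDiagnosticsRequest already does (serverutils.StartServer plus a SQL connection), and it assumes SET CLUSTER SETTING surfaces the validation error.

package stmtdiagnostics_test

import (
	"context"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/stretchr/testify/require"
)

// TestPollIntervalSettingValidation is a hypothetical sketch, not part of the
// commit: it assumes the NonNegativeDuration validator rejects negative values
// at SET time.
func TestPollIntervalSettingValidation(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	ctx := context.Background()
	s, db, _ := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop(ctx)

	// Zero is allowed and turns the polling loop off entirely.
	_, err := db.Exec("SET CLUSTER SETTING sql.stmt_diagnostics.poll_interval = '0'")
	require.NoError(t, err)

	// A negative interval should fail validation instead of being silently
	// treated as "disabled".
	_, err = db.Exec("SET CLUSTER SETTING sql.stmt_diagnostics.poll_interval = '-1s'")
	require.Error(t, err)
}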

pkg/sql/stmtdiagnostics/statement_diagnostics_test.go

Lines changed: 34 additions & 49 deletions
@@ -50,6 +50,11 @@ func TestDiagnosticsRequest(t *testing.T) {
 	_, err := db.Exec("CREATE TABLE test (x int PRIMARY KEY)")
 	require.NoError(t, err)
 
+	// Disable polling interval since we're inserting requests directly into the
+	// registry manually and want precise control of updating the registry.
+	_, err = db.Exec("SET CLUSTER SETTING sql.stmt_diagnostics.poll_interval = '0';")
+	require.NoError(t, err)
+
 	var collectUntilExpirationEnabled bool
 	isCompleted := func(reqID int64) (completed bool, diagnosticsID gosql.NullInt64) {
 		completedQuery := "SELECT completed, statement_diagnostics_id FROM system.statement_diagnostics_requests WHERE ID = $1"
@@ -76,28 +81,12 @@ func TestDiagnosticsRequest(t *testing.T) {
 		require.True(t, completed)
 		require.True(t, diagnosticsID.Valid)
 	}
-	// checkMaybeCompleted returns an error if 'completed' value for the given
-	// request is different from expectedCompleted.
-	checkMaybeCompleted := func(reqID int64, expectedCompleted bool) error {
-		completed, diagnosticsID := isCompleted(reqID)
-		if completed != expectedCompleted {
-			return errors.Newf("expected completed to be %t, but found %t", expectedCompleted, completed)
-		}
-		// diagnosticsID is NULL when the request hasn't been completed yet.
-		require.True(t, diagnosticsID.Valid == expectedCompleted)
-		return nil
-	}
 	setCollectUntilExpiration := func(v bool) {
 		collectUntilExpirationEnabled = v
 		_, err := db.Exec(
 			fmt.Sprintf("SET CLUSTER SETTING sql.stmt_diagnostics.collect_continuously.enabled = %t", v))
 		require.NoError(t, err)
 	}
-	setPollInterval := func(d time.Duration) {
-		_, err := db.Exec(
-			fmt.Sprintf("SET CLUSTER SETTING sql.stmt_diagnostics.poll_interval = '%s'", d))
-		require.NoError(t, err)
-	}
 
 	var minExecutionLatency, expiresAfter time.Duration
 	var samplingProbability float64
@@ -259,26 +248,21 @@ func TestDiagnosticsRequest(t *testing.T) {
 			}
 			for _, expiresAfter := range []time.Duration{0, time.Second} {
 				t.Run(fmt.Sprintf("expiresAfter=%s", expiresAfter), func(t *testing.T) {
-					// TODO(yuzefovich): for some reason occasionally the
-					// bundle for the request is collected, so we use
-					// SucceedsSoon. Figure it out.
-					testutils.SucceedsSoon(t, func() error {
-						reqID, err := registry.InsertRequestInternal(
-							ctx, fprint, samplingProbability, minExecutionLatency, expiresAfter,
-						)
-						require.NoError(t, err)
-						checkNotCompleted(reqID)
-
-						err = registry.CancelRequest(ctx, reqID)
-						require.NoError(t, err)
-						checkNotCompleted(reqID)
-
-						// Run the query that is slow enough to satisfy the
-						// conditional request.
-						_, err = db.Exec("SELECT pg_sleep(0.2)")
-						require.NoError(t, err)
-						return checkMaybeCompleted(reqID, false /* expectedCompleted */)
-					})
+					reqID, err := registry.InsertRequestInternal(
+						ctx, fprint, samplingProbability, minExecutionLatency, expiresAfter,
+					)
+					require.NoError(t, err)
+					checkNotCompleted(reqID)
+
+					err = registry.CancelRequest(ctx, reqID)
+					require.NoError(t, err)
+					checkNotCompleted(reqID)
+
+					// Run the query that is slow enough to satisfy the
+					// conditional request.
+					_, err = db.Exec("SELECT pg_sleep(0.2)")
+					require.NoError(t, err)
+					checkNotCompleted(reqID)
 				})
 			}
 		})
@@ -324,7 +308,13 @@ func TestDiagnosticsRequest(t *testing.T) {
 				require.NoError(t, err)
 
 				wg.Wait()
-				return checkMaybeCompleted(reqID, true /* expectedCompleted */)
+
+				completed, diagnosticsID := isCompleted(reqID)
+				if !completed {
+					return errors.New("expected request to be completed")
+				}
+				require.True(t, diagnosticsID.Valid)
+				return nil
 			})
 		})
 	}
@@ -451,21 +441,16 @@ func TestDiagnosticsRequest(t *testing.T) {
 		// Sleep until expiration (and then some), and then run the query.
 		time.Sleep(expiresAfter + 100*time.Millisecond)
 
-		setPollInterval(10 * time.Millisecond)
-		defer setPollInterval(stmtdiagnostics.PollingInterval.Default())
-
-		// We should not find the request and a subsequent executions should not
-		// capture anything.
-		testutils.SucceedsSoon(t, func() error {
-			if found := registry.TestingFindRequest(reqID); found {
-				return errors.New("expected expired request to no longer be tracked")
-			}
-			return nil
-		})
-
+		// Even though the request has expired, it hasn't been removed from the
+		// registry yet (because we disabled the polling interval). When we run
+		// the query that matches the fingerprint, the expired request is
+		// removed, and the bundle is not collected.
 		_, err = db.Exec("SELECT pg_sleep(0.01)") // run the query
 		require.NoError(t, err)
 		checkNotCompleted(reqID)
+
+		// Sanity check that the request is no longer in the registry.
+		require.False(t, registry.TestingFindRequest(reqID))
 	})
 }
