Skip to content

Commit 682be9d

Browse files
committed
insights: fix flaky TestInsightsIntegrationForContention
`TestInsightsIntegrationForContention` is flaky due to checking for an expected contention insight event that in rare cases may not be generated. Previously, this test was not attempting to enforce that the blocking transaction blocks the expected query for the required amount of time. The test sets the insight latency threshold to 30ms, however in rare cases it's possible that the waiting query finishes executing in that time. This fix adds a wait group s.t. the blocking txn will wait for the waiting txn to begin executing, then sleep for a duration of 500ms. This should ensure we hit the 30ms threshold required by insights. Epic: none Fixes: cockroachdb#108368 Release note: None
1 parent 4c39761 commit 682be9d

File tree

2 files changed

+43
-44
lines changed

2 files changed

+43
-44
lines changed

pkg/sql/sqlstats/insights/integration/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ go_test(
1414
"//pkg/testutils",
1515
"//pkg/testutils/serverutils",
1616
"//pkg/testutils/skip",
17+
"//pkg/testutils/sqlutils",
1718
"//pkg/testutils/testcluster",
1819
"//pkg/util/leaktest",
1920
"//pkg/util/log",

pkg/sql/sqlstats/insights/integration/insights_test.go

Lines changed: 42 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717
"math"
1818
"os"
1919
"strings"
20-
"sync"
2120
"testing"
2221
"time"
2322

@@ -30,6 +29,7 @@ import (
3029
"github.com/cockroachdb/cockroach/pkg/testutils"
3130
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
3231
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
32+
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
3333
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
3434
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
3535
"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -509,62 +509,60 @@ func TestInsightsIntegrationForContention(t *testing.T) {
509509
args := base.TestClusterArgs{ServerArgs: base.TestServerArgs{Settings: settings}}
510510
tc := testcluster.StartTestCluster(t, 1, args)
511511
defer tc.Stopper().Stop(ctx)
512-
conn := tc.ServerConn(0)
513512

514-
_, err := conn.Exec("SET tracing = true;")
515-
require.NoError(t, err)
516-
_, err = conn.Exec("SET cluster setting sql.txn_stats.sample_rate = 1;")
517-
require.NoError(t, err)
513+
conn := sqlutils.MakeSQLRunner(tc.ServerConn(0))
514+
// This connection will ensure the setting is changed for secondary tenant.
515+
516+
conn.Exec(t, "SET tracing = true;")
517+
conn.Exec(t, "SET cluster setting sql.txn_stats.sample_rate = 1;")
518518
// Reduce the resolution interval to speed up the test.
519-
_, err = conn.Exec(
520-
`SET CLUSTER SETTING sql.contention.event_store.resolution_interval = '100ms'`)
521-
require.NoError(t, err)
522-
_, err = conn.Exec("CREATE TABLE t (id string PRIMARY KEY, s string);")
523-
require.NoError(t, err)
519+
conn.Exec(t, `SET CLUSTER SETTING sql.contention.event_store.resolution_interval = '100ms'`)
524520

525-
// Enable detection by setting a latencyThreshold > 0.
526-
latencyThreshold := 30 * time.Millisecond
527-
insights.LatencyThreshold.Override(ctx, &settings.SV, latencyThreshold)
521+
// Set the insights detection threshold lower.
522+
conn.Exec(t, "SET CLUSTER SETTING sql.insights.latency_threshold = '30ms'")
523+
524+
conn.Exec(t, "CREATE TABLE t (id string PRIMARY KEY, s string);")
525+
526+
// Create a new connection, and then start a transaction, update a row, sleep for a time,
527+
// and then complete the transaction. In a separate go routine attempt to update the same
528+
//row being updated concurrently, this will be blocked until the original transaction completes.
528529

529-
// Create a new connection, and then in a go routine have it start a transaction, update a row,
530-
// sleep for a time, and then complete the transaction.
531-
// With original connection attempt to update the same row being updated concurrently
532-
// in the separate go routine, this will be blocked until the original transaction completes.
533-
var wgTxnStarted sync.WaitGroup
534-
wgTxnStarted.Add(1)
530+
// Chan to wait for the txn to complete to avoid checking for insights before the txn is committed.
531+
txnDoneChan := make(chan struct{})
535532

536-
// Lock to wait for the txn to complete to avoid the test finishing before the txn is committed
537-
var wgTxnDone sync.WaitGroup
538-
wgTxnDone.Add(1)
533+
tx := conn.Begin(t)
539534

535+
_, errTxn := tx.ExecContext(ctx, "INSERT INTO t (id, s) VALUES ('test', 'originalValue');")
536+
require.NoError(t, errTxn)
537+
538+
waitingTxStartedChan := make(chan struct{})
539+
approxStmtRuntime := timeutil.NewStopWatch()
540540
go func() {
541-
tx, errTxn := conn.BeginTx(ctx, &gosql.TxOptions{})
542-
require.NoError(t, errTxn)
543-
_, errTxn = tx.ExecContext(ctx, "INSERT INTO t (id, s) VALUES ('test', 'originalValue');")
544-
require.NoError(t, errTxn)
545-
wgTxnStarted.Done()
546-
_, errTxn = tx.ExecContext(ctx, "select pg_sleep(.5);")
547-
require.NoError(t, errTxn)
548-
errTxn = tx.Commit()
549-
require.NoError(t, errTxn)
550-
wgTxnDone.Done()
541+
waitingTxStartedChan <- struct{}{}
542+
approxStmtRuntime.Start()
543+
// This will be blocked until the started txn above finishes.
544+
conn.Exec(t, "UPDATE t SET s = 'mainThread' where id = 'test';")
545+
approxStmtRuntime.Stop()
546+
txnDoneChan <- struct{}{}
551547
}()
552548

553-
start := timeutil.Now()
549+
<-waitingTxStartedChan
554550

555-
// Need to wait for the txn to start to ensure lock contention
556-
wgTxnStarted.Wait()
557-
// This will be blocked until the updateRowWithDelay finishes.
558-
_, err = conn.ExecContext(ctx, "UPDATE t SET s = 'mainThread' where id = 'test';")
559-
require.NoError(t, err)
560-
end := timeutil.Now()
561-
require.GreaterOrEqual(t, end.Sub(start), 500*time.Millisecond)
551+
_, errTxn = tx.ExecContext(ctx, "select pg_sleep(0.5);")
552+
require.NoError(t, errTxn)
553+
require.NoError(t, tx.Commit())
554+
555+
<-txnDoneChan
562556

563-
wgTxnDone.Wait()
557+
// Verify the approx run time was around 50ms. The pg_sleep should have blocked the stmt for at
558+
// least 500ms, but since the stopwatch doesn't measure the runtime exactly we'll use a much
559+
// smaller value that is >= the required insights threshold.
560+
require.GreaterOrEqualf(t,
561+
approxStmtRuntime.Elapsed().Milliseconds(), int64(100), "expected stmt to run for at least 100ms")
564562

565563
// Verify the table content is valid.
566564
testutils.SucceedsWithin(t, func() error {
567-
rows, err := conn.QueryContext(ctx, `SELECT
565+
rows, err := conn.DB.QueryContext(ctx, `SELECT
568566
query,
569567
insight.contention::FLOAT,
570568
sum(txn_contention.contention_duration)::FLOAT AS durationMs,
@@ -630,7 +628,7 @@ func TestInsightsIntegrationForContention(t *testing.T) {
630628
}
631629

632630
if rowCount < 1 {
633-
return fmt.Errorf("node_execution_insights did not return any rows")
631+
return fmt.Errorf("cluster_execution_insights did not return any rows")
634632
}
635633

636634
return nil

0 commit comments

Comments
 (0)