Skip to content

Commit 431351c

Browse files
craig[bot]fqazi
andcommitted
151202: catalog/lease: fix hang on range feed restart r=fqazi a=fqazi Previously, the test was hanging because the lease manager mutex was held while closing the range feed. While a real failure would likely occur in the middle of a range feed callback during a restart, the synthetic scenario in this test made a hang possible. To avoid this, the lease manager lock is now released before closing the range feed. Fixes: #150679 Fixes: #151088 Release note: None 151415: roachtest: deflake psycopg r=fqazi a=fqazi Previously, the DNS tests would flake based on the order of hostnames in the output. This patch places these tests on the ignore list. Fixes: #151379 Fixes: #150651 Release note: None Co-authored-by: Faizan Qazi <[email protected]>
3 parents 4ce0faf + 83e2511 + 3f276a8 commit 431351c

File tree

3 files changed

+40
-7
lines changed

3 files changed

+40
-7
lines changed

pkg/cmd/roachtest/tests/psycopg_blocklist.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ package tests
1919
var psycopgBlockList = blocklist{}
2020

2121
var psycopgIgnoreList = blocklist{
22+
`tests.test_dns_srv.test_srv[host=_pg._tcp.bar.com-host=db1.example.com,db4.example.com,db3.example.com,db2.example.com port=5432,5432,5433,5432-None]`: "flaky; see #151379",
23+
`tests.test_dns_srv.test_srv_async[asyncio-host=_pg._tcp.bar.com-host=db1.example.com,db4.example.com,db3.example.com,db2.example.com port=5432,5432,5433,5432-None]`: "flaky; see #151379",
2224
`tests.pool.test_pool.test_connect_check_timeout`: "requires insecure mode",
2325
`tests.pool.test_pool.test_reconnect`: "requires insecure mode",
2426
`tests.pool.test_pool.test_reconnect_after_grow_failed`: "requires insecure mode",

pkg/sql/catalog/lease/lease.go

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,6 +1107,9 @@ type Manager struct {
11071107

11081108
// rangeFeed current range feed on system.descriptors.
11091109
rangeFeed *rangefeed.RangeFeed
1110+
1111+
// rangeFeedRestartInProgress tracks if a range feed restart is in progress.
1112+
rangeFeedRestartInProgress bool
11101113
}
11111114

11121115
// closeTimeStamp for the range feed, which is the timestamp
@@ -1786,6 +1789,19 @@ func (m *Manager) GetSafeReplicationTS() hlc.Timestamp {
17861789
return m.closeTimestamp.Load().(hlc.Timestamp)
17871790
}
17881791

1792+
// closeRangeFeed closes the currently open range feed, which will involve
1793+
// temporarily releasing the lease manager mutex.
1794+
func (m *Manager) closeRangeFeedLocked() {
1795+
// We cannot terminate the range feed while holding the lease manager
1796+
// lock, since there may be event handlers that need the lock that need to
1797+
// drain.
1798+
oldRangeFeed := m.mu.rangeFeed
1799+
m.mu.rangeFeed = nil
1800+
m.mu.Unlock() // nolint:deferunlockcheck
1801+
oldRangeFeed.Close()
1802+
m.mu.Lock() // nolint:deferunlockcheck
1803+
}
1804+
17891805
// watchForUpdates will watch a rangefeed on the system.descriptor table for
17901806
// updates.
17911807
func (m *Manager) watchForUpdates(ctx context.Context) {
@@ -1847,14 +1863,12 @@ func (m *Manager) watchForUpdates(ctx context.Context) {
18471863
m.closeTimestamp.Store(checkpoint.ResolvedTS)
18481864
}
18491865

1850-
// If we already started a range feed terminate it first
1866+
// Assert that the range feed is already terminated.
18511867
if m.mu.rangeFeed != nil {
1852-
m.mu.rangeFeed.Close()
1853-
m.mu.rangeFeed = nil
1854-
if m.testingKnobs.RangeFeedResetChannel != nil {
1855-
close(m.testingKnobs.RangeFeedResetChannel)
1856-
m.testingKnobs.RangeFeedResetChannel = nil
1868+
if buildutil.CrdbTestBuild {
1869+
panic(errors.AssertionFailedf("range feed was not closed before a restart attempt"))
18571870
}
1871+
log.Warningf(ctx, "range feed was not closed before a restart attempt")
18581872
}
18591873
// Ignore errors here because they indicate that the server is shutting down.
18601874
// Also note that the range feed automatically shuts down when the server
@@ -2011,12 +2025,28 @@ func (m *Manager) handleRangeFeedError(ctx context.Context) {
20112025
}
20122026

20132027
func (m *Manager) restartLeasingRangeFeedLocked(ctx context.Context) {
2028+
// If someone else is already starting a range feed then exit early.
2029+
if m.mu.rangeFeedRestartInProgress {
2030+
return
2031+
}
20142032
log.Warning(ctx, "attempting restart of leasing range feed")
2033+
// We will temporarily release the lock closing the range feed,
2034+
// in case things need to drain before termination. It is possible for
2035+
// another restart to enter once we release the lock.
2036+
m.mu.rangeFeedRestartInProgress = true
2037+
if m.mu.rangeFeed != nil {
2038+
m.closeRangeFeedLocked()
2039+
if m.testingKnobs.RangeFeedResetChannel != nil {
2040+
close(m.testingKnobs.RangeFeedResetChannel)
2041+
m.testingKnobs.RangeFeedResetChannel = nil
2042+
}
2043+
}
20152044
// Attempt a range feed restart if it has been down too long.
20162045
m.watchForUpdates(ctx)
20172046
// Track when the last restart occurred.
20182047
m.mu.rangeFeedIsUnavailableAt = timeutil.Now()
20192048
m.mu.rangeFeedCheckpoints = 0
2049+
m.mu.rangeFeedRestartInProgress = false
20202050
}
20212051

20222052
// cleanupExpiredSessionLeases expires session based leases marked for removal,

pkg/sql/catalog/lease/lease_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3189,7 +3189,8 @@ func TestAmbiguousResultIsRetried(t *testing.T) {
31893189
func TestLeaseDescriptorRangeFeedFailure(t *testing.T) {
31903190
defer leaktest.AfterTest(t)()
31913191
defer log.Scope(t).Close(t)
3192-
skip.UnderDuress(t)
3192+
// Too slow for execution under race.
3193+
skip.UnderRace(t)
31933194

31943195
settings := cluster.MakeTestingClusterSettings()
31953196
ctx := context.Background()

0 commit comments

Comments
 (0)