Skip to content

Commit 83e2511

Browse files
committed
catalog/lease: fix hang on range feed restart
Previously, the test was hanging because the lease manager mutex was held while closing the range feed. While a real failure would likely occur in the middle of a range feed callback during a restart, the synthetic scenario in this test made a hang possible. To avoid this, the lease manager lock is now released before closing the range feed. Fixes: #150679 Fixes: #151088 Release note: None
1 parent 2a7b3c3 commit 83e2511

File tree

2 files changed

+38
-7
lines changed

2 files changed

+38
-7
lines changed

pkg/sql/catalog/lease/lease.go

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,6 +1107,9 @@ type Manager struct {
11071107

11081108
// rangeFeed current range feed on system.descriptors.
11091109
rangeFeed *rangefeed.RangeFeed
1110+
1111+
// rangeFeedRestartInProgress tracks if a range feed restart is in progress.
1112+
rangeFeedRestartInProgress bool
11101113
}
11111114

11121115
// closeTimeStamp for the range feed, which is the timestamp
@@ -1786,6 +1789,19 @@ func (m *Manager) GetSafeReplicationTS() hlc.Timestamp {
17861789
return m.closeTimestamp.Load().(hlc.Timestamp)
17871790
}
17881791

1792+
// closeRangeFeed closes the currently open range feed, which will involve
1793+
// temporarily releasing the lease manager mutex.
1794+
func (m *Manager) closeRangeFeedLocked() {
1795+
// We cannot terminate the range feed while holding the lease manager
1796+
// lock, since there may be event handlers that need the lock that need to
1797+
// drain.
1798+
oldRangeFeed := m.mu.rangeFeed
1799+
m.mu.rangeFeed = nil
1800+
m.mu.Unlock() // nolint:deferunlockcheck
1801+
oldRangeFeed.Close()
1802+
m.mu.Lock() // nolint:deferunlockcheck
1803+
}
1804+
17891805
// watchForUpdates will watch a rangefeed on the system.descriptor table for
17901806
// updates.
17911807
func (m *Manager) watchForUpdates(ctx context.Context) {
@@ -1847,14 +1863,12 @@ func (m *Manager) watchForUpdates(ctx context.Context) {
18471863
m.closeTimestamp.Store(checkpoint.ResolvedTS)
18481864
}
18491865

1850-
// If we already started a range feed terminate it first
1866+
// Assert that the range feed is already terminated.
18511867
if m.mu.rangeFeed != nil {
1852-
m.mu.rangeFeed.Close()
1853-
m.mu.rangeFeed = nil
1854-
if m.testingKnobs.RangeFeedResetChannel != nil {
1855-
close(m.testingKnobs.RangeFeedResetChannel)
1856-
m.testingKnobs.RangeFeedResetChannel = nil
1868+
if buildutil.CrdbTestBuild {
1869+
panic(errors.AssertionFailedf("range feed was not closed before a restart attempt"))
18571870
}
1871+
log.Warningf(ctx, "range feed was not closed before a restart attempt")
18581872
}
18591873
// Ignore errors here because they indicate that the server is shutting down.
18601874
// Also note that the range feed automatically shuts down when the server
@@ -2011,12 +2025,28 @@ func (m *Manager) handleRangeFeedError(ctx context.Context) {
20112025
}
20122026

20132027
func (m *Manager) restartLeasingRangeFeedLocked(ctx context.Context) {
2028+
// If someone else is already starting a range feed then exit early.
2029+
if m.mu.rangeFeedRestartInProgress {
2030+
return
2031+
}
20142032
log.Warning(ctx, "attempting restart of leasing range feed")
2033+
// We will temporarily release the lock closing the range feed,
2034+
// in case things need to drain before termination. It is possible for
2035+
// another restart to enter once we release the lock.
2036+
m.mu.rangeFeedRestartInProgress = true
2037+
if m.mu.rangeFeed != nil {
2038+
m.closeRangeFeedLocked()
2039+
if m.testingKnobs.RangeFeedResetChannel != nil {
2040+
close(m.testingKnobs.RangeFeedResetChannel)
2041+
m.testingKnobs.RangeFeedResetChannel = nil
2042+
}
2043+
}
20152044
// Attempt a range feed restart if it has been down too long.
20162045
m.watchForUpdates(ctx)
20172046
// Track when the last restart occurred.
20182047
m.mu.rangeFeedIsUnavailableAt = timeutil.Now()
20192048
m.mu.rangeFeedCheckpoints = 0
2049+
m.mu.rangeFeedRestartInProgress = false
20202050
}
20212051

20222052
// cleanupExpiredSessionLeases expires session based leases marked for removal,

pkg/sql/catalog/lease/lease_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3189,7 +3189,8 @@ func TestAmbiguousResultIsRetried(t *testing.T) {
31893189
func TestLeaseDescriptorRangeFeedFailure(t *testing.T) {
31903190
defer leaktest.AfterTest(t)()
31913191
defer log.Scope(t).Close(t)
3192-
skip.UnderDuress(t)
3192+
// Too slow for execution under race.
3193+
skip.UnderRace(t)
31933194

31943195
settings := cluster.MakeTestingClusterSettings()
31953196
ctx := context.Background()

0 commit comments

Comments
 (0)