Skip to content

Commit 6abab18

Browse files
committed
catalog/lease: fix error handling for purgeOldVersions
Previously, when purging old versions we would acquire a lease on the latest version, which was introduced when we added logic for acquiring leases on the previous version. The logic to acquire the previous version would clear the error, preventing errors from correctly surfacing and causing issues with dropped / offline descriptors. To address this, ensure the acquisition logic for the previous version is only executed if a clean up will occur. This patch also enhances logging for error scenarios where we could fail to acquire new leases. Informs: #156176 Release note: None
1 parent 5dd32a1 commit 6abab18

File tree

2 files changed

+30
-9
lines changed

2 files changed

+30
-9
lines changed

pkg/sql/catalog/lease/descriptor_state.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ func (t *descriptorState) findForTimestampImpl(
138138

139139
// If we have the initial version of the descriptor, and it satisfies the read
140140
// timestamp, then the object was just created. We can confirm it satisfies
141-
// the request, by executing findForTimestampImpl with the readTimestamp instead.
141+
// the request by executing findForTimestampImpl with the readTimestamp instead.
142142
if oldest := t.mu.active.findOldest(); hasDifferentReadTimeStamp &&
143143
oldest != nil &&
144144
oldest.GetVersion() == 1 &&

pkg/sql/catalog/lease/lease.go

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1274,7 +1274,7 @@ func (m *Manager) purgeOldVersions(
12741274
retry.Options{
12751275
MaxDuration: time.Second * 30}); r.Next(); {
12761276
// Acquire a refcount on the descriptor on the latest version to maintain an
1277-
// active lease, so that it doesn't get released when removeInactives()
1277+
// active lease so that it doesn't get released when removeInactives()
12781278
// is called below. Release this lease after calling removeInactives().
12791279
desc, _, err = t.findForTimestamp(ctx, TimestampToReadTimestamp(m.storage.clock.Now()))
12801280
if err == nil || !errors.Is(err, errRenewLease) {
@@ -1290,7 +1290,7 @@ func (m *Manager) purgeOldVersions(
12901290
// Assert this should never happen due to a fixed expiration, since the range
12911291
// feed is responsible for purging old versions and acquiring new versions.
12921292
if newest.hasFixedExpiration() {
1293-
return errors.AssertionFailedf("the latest version of the descriptor has" +
1293+
return errors.AssertionFailedf("the latest version of the descriptor has " +
12941294
"a fixed expiration, this should never happen")
12951295
}
12961296
// Otherwise, we ran into some type of transient issue, where the sqllivness
@@ -1307,9 +1307,10 @@ func (m *Manager) purgeOldVersions(
13071307
err = nil
13081308
}
13091309

1310-
// Optionally, acquire the refcount on the previous version.
1310+
// Optionally, acquire the refcount on the previous version for the locked
1311+
// leasing mode.
13111312
acquireLeaseOnPrevious := func() error {
1312-
if dropped || !LockedLeaseTimestamp.Get(&m.storage.settings.SV) {
1313+
if !LockedLeaseTimestamp.Get(&m.storage.settings.SV) {
13131314
return nil
13141315
}
13151316
var handles []*closeTimeStampHandle
@@ -1376,11 +1377,12 @@ func (m *Manager) purgeOldVersions(
13761377
return gatheredErrors
13771378
}
13781379

1379-
if err = acquireLeaseOnPrevious(); err != nil {
1380-
log.Dev.Errorf(ctx, "unable to acquire lease on previous version of descriptor: %s", err)
1381-
}
1382-
13831380
if isInactive := catalog.HasInactiveDescriptorError(err); err == nil || isInactive {
1381+
// If previous versions are released, then acquire a lease on the previous
1382+
// version for the locked leasing mode.
1383+
if acquirePreviousErr := acquireLeaseOnPrevious(); acquirePreviousErr != nil {
1384+
log.Dev.Errorf(ctx, "unable to acquire lease on previous version of descriptor: %s", acquirePreviousErr)
1385+
}
13841386
removeInactives(isInactive)
13851387
if desc != nil {
13861388
t.release(ctx, desc)
@@ -2325,12 +2327,25 @@ func (m *Manager) StartRefreshLeasesTask(ctx context.Context, s *stop.Stopper, d
23252327
defer m.initComplete.Swap(true)
23262328
m.watchForUpdates(ctx)
23272329
_ = s.RunAsyncTask(ctx, "refresh-leases", func(ctx context.Context) {
2330+
defer func() {
2331+
if err := recover(); err != nil {
2332+
log.Dev.Warningf(ctx, "panic in refresh-leases: %v", err)
2333+
panic(err)
2334+
}
2335+
}()
2336+
23282337
for {
23292338
select {
23302339
case id := <-m.descDelCh:
23312340
// Descriptor is marked as deleted, so mark it for deletion or
23322341
// remove it if it's no longer in use.
23332342
_ = s.RunAsyncTask(ctx, "purgeOldVersionsOrAcquireInitialVersion deleted descriptor", func(ctx context.Context) {
2343+
defer func() {
2344+
if err := recover(); err != nil {
2345+
log.Dev.Warningf(ctx, "panic in purgeOldVersionsOrAcquireInitialVersion deleted descriptor: %v", err)
2346+
panic(err)
2347+
}
2348+
}()
23342349
// Once the descriptor is purged notify that some change has occurred.
23352350
defer m.leaseGeneration.Add(1)
23362351
state := m.findNewest(id)
@@ -2412,6 +2427,12 @@ func (m *Manager) StartRefreshLeasesTask(ctx context.Context, s *stop.Stopper, d
24122427
// of increased latency right as the descriptor has been committed.
24132428
if now := db.Clock().Now(); now.Less(desc.GetModificationTime()) {
24142429
_ = s.RunAsyncTask(ctx, "wait to purgeOldVersionsOrAcquireInitialVersion", func(ctx context.Context) {
2430+
defer func() {
2431+
if err := recover(); err != nil {
2432+
log.Dev.Warningf(ctx, "panic in wait to purgeOldVersionsOrAcquireInitialVersion: %v", err)
2433+
panic(err)
2434+
}
2435+
}()
24152436
toWait := time.Duration(desc.GetModificationTime().WallTime - now.WallTime)
24162437
select {
24172438
case <-time.After(toWait):

0 commit comments

Comments
 (0)