Skip to content

Commit 7e104b3

Browse files
committed
sql/lease: add progress logging to orphaned lease cleanup
This change adds comprehensive logging to the orphaned lease cleanup operations to improve visibility during critical operations like upgrades. Previously, large numbers of orphaned leases could cause cleanup to run for extended periods without any indication of progress, potentially blocking upgrades and schema changes. Both cleanup functions include progress and completion logging. Periodic progress logging is added during large cleanup operations, along with warning logs for async task failures during lease releases. This addresses cases where orphaned lease cleanup could appear to hang or fail silently during upgrades, leaving operators without visibility into the cleanup progress. Resolves: #151411 Release note: None
1 parent 5aaab78 commit 7e104b3

File tree

1 file changed

+50
-8
lines changed

1 file changed

+50
-8
lines changed

pkg/sql/catalog/lease/lease.go

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2401,11 +2401,17 @@ func (m *Manager) deleteOrphanedLeasesFromStaleSession(
24012401
region = locality.Tiers[0].Value
24022402
}
24032403

2404+
log.Infof(ctx, "starting orphaned lease cleanup from stale sessions in region %s", region)
2405+
24042406
var distinctSessions []tree.Datums
24052407
aostTime := hlc.Timestamp{WallTime: initialTimestamp}
24062408
distinctSessionQuery := `SELECT DISTINCT(session_id) FROM system.lease AS OF SYSTEM TIME %s WHERE crdb_region=$1 AND NOT crdb_internal.sql_liveness_is_alive(session_id, true) LIMIT $2`
24072409
syntheticDescriptors := catalog.Descriptors{systemschema.LeaseTable()}
24082410
const limit = 50
2411+
2412+
totalSessionsProcessed := 0
2413+
totalLeasesDeleted := 0
2414+
24092415
for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
24102416
// Get a list of distinct, dead session IDs that exist in the system.lease
24112417
// table.
@@ -2427,20 +2433,35 @@ func (m *Manager) deleteOrphanedLeasesFromStaleSession(
24272433
}
24282434
}
24292435

2436+
if len(distinctSessions) > 0 {
2437+
log.Infof(ctx, "found %d dead sessions from which to clean up orphaned leases", len(distinctSessions))
2438+
}
2439+
24302440
// Delete rows in our lease table with orphaned sessions.
24312441
for _, sessionRow := range distinctSessions {
24322442
sessionID := sqlliveness.SessionID(tree.MustBeDBytes(sessionRow[0]))
2433-
if err = deleteLeaseWithSessionIDWithBatch(ctx, ex, retryOpts, syntheticDescriptors, sessionID, region, limit); err != nil {
2434-
log.Warningf(ctx, "unable to delete orphaned leases: %v", err)
2443+
sessionLeasesDeleted, err := deleteLeaseWithSessionIDWithBatch(ctx, ex, retryOpts, syntheticDescriptors, sessionID, region, limit)
2444+
if err != nil {
2445+
log.Warningf(ctx, "unable to delete orphaned leases for session %s: %v", sessionID, err)
24352446
break
24362447
}
2448+
totalLeasesDeleted += sessionLeasesDeleted
2449+
log.Infof(ctx, "deleted %d orphaned leases for dead session %s", sessionLeasesDeleted, sessionID)
24372450
}
24382451

2452+
totalSessionsProcessed += len(distinctSessions)
2453+
24392454
// No more dead sessions to clean up.
24402455
if len(distinctSessions) < limit {
2456+
log.Infof(ctx, "completed orphaned lease cleanup for region %s: %d sessions processed, %d leases deleted",
2457+
region, totalSessionsProcessed, totalLeasesDeleted)
24412458
return
24422459
}
24432460

2461+
// Log progress for large cleanup operations.
2462+
log.Infof(ctx, "orphaned lease cleanup progress for region %s: %d sessions processed, %d leases deleted so far",
2463+
region, totalSessionsProcessed, totalLeasesDeleted)
2464+
24442465
// Advance our aostTime timstamp so that our query to detect leases with
24452466
// dead sessions is aware of new deletes and does not keep selecting the
24462467
// same leases.
@@ -2449,7 +2470,7 @@ func (m *Manager) deleteOrphanedLeasesFromStaleSession(
24492470
}
24502471

24512472
// deleteLeaseWithSessionIDWithBatch uses batchSize to batch deletes for leases
2452-
// with the given sessionID in system.lease.
2473+
// with the given sessionID in system.lease. Returns the total number of leases deleted.
24532474
func deleteLeaseWithSessionIDWithBatch(
24542475
ctx context.Context,
24552476
ex isql.Executor,
@@ -2458,7 +2479,8 @@ func deleteLeaseWithSessionIDWithBatch(
24582479
sessionID sqlliveness.SessionID,
24592480
region string,
24602481
batchSize int,
2461-
) error {
2482+
) (int, error) {
2483+
totalDeleted := 0
24622484
for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
24632485
var rowsDeleted int
24642486
deleteOrphanedQuery := `DELETE FROM system.lease WHERE session_id=$1 AND crdb_region=$2 LIMIT $3`
@@ -2473,16 +2495,17 @@ func deleteLeaseWithSessionIDWithBatch(
24732495
return err
24742496
}); err != nil {
24752497
if !startup.IsRetryableReplicaError(err) {
2476-
return err
2498+
return totalDeleted, err
24772499
}
24782500
}
2501+
totalDeleted += rowsDeleted
24792502

24802503
// No more rows to clean up.
24812504
if rowsDeleted < batchSize {
24822505
break
24832506
}
24842507
}
2485-
return nil
2508+
return totalDeleted, nil
24862509
}
24872510

24882511
func (m *Manager) deleteOrphanedLeasesWithSameInstanceID(
@@ -2518,8 +2541,15 @@ func (m *Manager) deleteOrphanedLeasesWithSameInstanceID(
25182541
log.Warningf(ctx, "unable to read orphaned leases: %v", err)
25192542
return
25202543
}
2544+
2545+
totalLeases := len(rows)
2546+
log.Infof(ctx, "found %d orphaned leases to clean up for instance ID %d", totalLeases, instanceID)
2547+
if totalLeases == 0 {
2548+
return
2549+
}
2550+
25212551
var wg sync.WaitGroup
2522-
defer wg.Wait()
2552+
var releasedCount atomic.Int64
25232553
for i := range rows {
25242554
// Early exit?
25252555
row := rows[i]
@@ -2544,13 +2574,25 @@ func (m *Manager) deleteOrphanedLeasesWithSameInstanceID(
25442574
WaitForSem: true,
25452575
},
25462576
func(ctx context.Context) {
2577+
defer wg.Done()
25472578
m.storage.release(ctx, m.stopper, lease)
2579+
released := releasedCount.Add(1)
25482580
log.Infof(ctx, "released orphaned lease: %+v", lease)
2549-
wg.Done()
2581+
2582+
// Log progress every 100 leases for large cleanup operations.
2583+
if released%100 == 0 || released == int64(totalLeases) {
2584+
log.Infof(ctx, "orphaned lease cleanup progress for instance ID %d: %d/%d leases released",
2585+
instanceID, released, totalLeases)
2586+
}
25502587
}); err != nil {
2588+
log.Warningf(ctx, "could not start async task for releasing orphaned lease %+v: %v", lease, err)
25512589
wg.Done()
25522590
}
25532591
}
2592+
2593+
wg.Wait()
2594+
log.Infof(ctx, "completed orphaned lease cleanup for instance ID %d: %d/%d leases released",
2595+
instanceID, releasedCount.Load(), totalLeases)
25542596
}
25552597

25562598
// TestingGetBoundAccount returns the bound account used by the lease manager.

0 commit comments

Comments
 (0)