Skip to content

Commit 9849426

Browse files
craig[bot]rafiss
andcommitted
Merge #151431
151431: sql/lease: add progress logging to orphaned lease cleanup r=rafiss a=rafiss This change adds comprehensive logging to the orphaned lease cleanup operations to improve visibility during critical operations like upgrades. Previously, large numbers of orphaned leases could cause cleanup to run for extended periods without any indication of progress, potentially blocking upgrades and schema changes. Both cleanup functions include progress and completion logging. Periodic progress logging is added during large cleanup operations, along with warning logs for async task failures during lease releases. This addresses cases where orphaned lease cleanup could appear to hang or fail silently during upgrades, leaving operators without visibility into the cleanup progress. Resolves: #151411 Release note: None Co-authored-by: Rafi Shamim <[email protected]>
2 parents 3824cc2 + 7e104b3 commit 9849426

File tree

1 file changed

+50
-8
lines changed

1 file changed

+50
-8
lines changed

pkg/sql/catalog/lease/lease.go

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2431,11 +2431,17 @@ func (m *Manager) deleteOrphanedLeasesFromStaleSession(
24312431
region = locality.Tiers[0].Value
24322432
}
24332433

2434+
log.Infof(ctx, "starting orphaned lease cleanup from stale sessions in region %s", region)
2435+
24342436
var distinctSessions []tree.Datums
24352437
aostTime := hlc.Timestamp{WallTime: initialTimestamp}
24362438
distinctSessionQuery := `SELECT DISTINCT(session_id) FROM system.lease AS OF SYSTEM TIME %s WHERE crdb_region=$1 AND NOT crdb_internal.sql_liveness_is_alive(session_id, true) LIMIT $2`
24372439
syntheticDescriptors := catalog.Descriptors{systemschema.LeaseTable()}
24382440
const limit = 50
2441+
2442+
totalSessionsProcessed := 0
2443+
totalLeasesDeleted := 0
2444+
24392445
for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
24402446
// Get a list of distinct, dead session IDs that exist in the system.lease
24412447
// table.
@@ -2457,20 +2463,35 @@ func (m *Manager) deleteOrphanedLeasesFromStaleSession(
24572463
}
24582464
}
24592465

2466+
if len(distinctSessions) > 0 {
2467+
log.Infof(ctx, "found %d dead sessions from which to clean up orphaned leases", len(distinctSessions))
2468+
}
2469+
24602470
// Delete rows in our lease table with orphaned sessions.
24612471
for _, sessionRow := range distinctSessions {
24622472
sessionID := sqlliveness.SessionID(tree.MustBeDBytes(sessionRow[0]))
2463-
if err = deleteLeaseWithSessionIDWithBatch(ctx, ex, retryOpts, syntheticDescriptors, sessionID, region, limit); err != nil {
2464-
log.Warningf(ctx, "unable to delete orphaned leases: %v", err)
2473+
sessionLeasesDeleted, err := deleteLeaseWithSessionIDWithBatch(ctx, ex, retryOpts, syntheticDescriptors, sessionID, region, limit)
2474+
if err != nil {
2475+
log.Warningf(ctx, "unable to delete orphaned leases for session %s: %v", sessionID, err)
24652476
break
24662477
}
2478+
totalLeasesDeleted += sessionLeasesDeleted
2479+
log.Infof(ctx, "deleted %d orphaned leases for dead session %s", sessionLeasesDeleted, sessionID)
24672480
}
24682481

2482+
totalSessionsProcessed += len(distinctSessions)
2483+
24692484
// No more dead sessions to clean up.
24702485
if len(distinctSessions) < limit {
2486+
log.Infof(ctx, "completed orphaned lease cleanup for region %s: %d sessions processed, %d leases deleted",
2487+
region, totalSessionsProcessed, totalLeasesDeleted)
24712488
return
24722489
}
24732490

2491+
// Log progress for large cleanup operations.
2492+
log.Infof(ctx, "orphaned lease cleanup progress for region %s: %d sessions processed, %d leases deleted so far",
2493+
region, totalSessionsProcessed, totalLeasesDeleted)
2494+
24742495
// Advance our aostTime timstamp so that our query to detect leases with
24752496
// dead sessions is aware of new deletes and does not keep selecting the
24762497
// same leases.
@@ -2479,7 +2500,7 @@ func (m *Manager) deleteOrphanedLeasesFromStaleSession(
24792500
}
24802501

24812502
// deleteLeaseWithSessionIDWithBatch uses batchSize to batch deletes for leases
2482-
// with the given sessionID in system.lease.
2503+
// with the given sessionID in system.lease. Returns the total number of leases deleted.
24832504
func deleteLeaseWithSessionIDWithBatch(
24842505
ctx context.Context,
24852506
ex isql.Executor,
@@ -2488,7 +2509,8 @@ func deleteLeaseWithSessionIDWithBatch(
24882509
sessionID sqlliveness.SessionID,
24892510
region string,
24902511
batchSize int,
2491-
) error {
2512+
) (int, error) {
2513+
totalDeleted := 0
24922514
for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
24932515
var rowsDeleted int
24942516
deleteOrphanedQuery := `DELETE FROM system.lease WHERE session_id=$1 AND crdb_region=$2 LIMIT $3`
@@ -2503,16 +2525,17 @@ func deleteLeaseWithSessionIDWithBatch(
25032525
return err
25042526
}); err != nil {
25052527
if !startup.IsRetryableReplicaError(err) {
2506-
return err
2528+
return totalDeleted, err
25072529
}
25082530
}
2531+
totalDeleted += rowsDeleted
25092532

25102533
// No more rows to clean up.
25112534
if rowsDeleted < batchSize {
25122535
break
25132536
}
25142537
}
2515-
return nil
2538+
return totalDeleted, nil
25162539
}
25172540

25182541
func (m *Manager) deleteOrphanedLeasesWithSameInstanceID(
@@ -2548,8 +2571,15 @@ func (m *Manager) deleteOrphanedLeasesWithSameInstanceID(
25482571
log.Warningf(ctx, "unable to read orphaned leases: %v", err)
25492572
return
25502573
}
2574+
2575+
totalLeases := len(rows)
2576+
log.Infof(ctx, "found %d orphaned leases to clean up for instance ID %d", totalLeases, instanceID)
2577+
if totalLeases == 0 {
2578+
return
2579+
}
2580+
25512581
var wg sync.WaitGroup
2552-
defer wg.Wait()
2582+
var releasedCount atomic.Int64
25532583
for i := range rows {
25542584
// Early exit?
25552585
row := rows[i]
@@ -2574,13 +2604,25 @@ func (m *Manager) deleteOrphanedLeasesWithSameInstanceID(
25742604
WaitForSem: true,
25752605
},
25762606
func(ctx context.Context) {
2607+
defer wg.Done()
25772608
m.storage.release(ctx, m.stopper, lease)
2609+
released := releasedCount.Add(1)
25782610
log.Infof(ctx, "released orphaned lease: %+v", lease)
2579-
wg.Done()
2611+
2612+
// Log progress every 100 leases for large cleanup operations.
2613+
if released%100 == 0 || released == int64(totalLeases) {
2614+
log.Infof(ctx, "orphaned lease cleanup progress for instance ID %d: %d/%d leases released",
2615+
instanceID, released, totalLeases)
2616+
}
25802617
}); err != nil {
2618+
log.Warningf(ctx, "could not start async task for releasing orphaned lease %+v: %v", lease, err)
25812619
wg.Done()
25822620
}
25832621
}
2622+
2623+
wg.Wait()
2624+
log.Infof(ctx, "completed orphaned lease cleanup for instance ID %d: %d/%d leases released",
2625+
instanceID, releasedCount.Load(), totalLeases)
25842626
}
25852627

25862628
// TestingGetBoundAccount returns the bound account used by the lease manager.

0 commit comments

Comments
 (0)