Skip to content

Commit e18c6f6

Browse files
committed
sql/lease: add progress logging to orphaned lease cleanup
This change adds comprehensive logging to the orphaned lease cleanup operations to improve visibility during critical operations like upgrades. Previously, large numbers of orphaned leases could cause cleanup to run for extended periods without any indication of progress, potentially blocking upgrades and schema changes. Both cleanup functions include progress and completion logging. Periodic progress logging is added during large cleanup operations, along with warning logs for async task failures during lease releases. This addresses cases where orphaned lease cleanup could appear to hang or fail silently during upgrades, leaving operators without visibility into the cleanup progress. Resolves: #151411 Release note: None
1 parent 6c3f184 commit e18c6f6

File tree

1 file changed

+50
-8
lines changed

1 file changed

+50
-8
lines changed

pkg/sql/catalog/lease/lease.go

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2079,11 +2079,17 @@ func (m *Manager) deleteOrphanedLeasesFromStaleSession(
20792079
region = locality.Tiers[0].Value
20802080
}
20812081

2082+
log.Infof(ctx, "starting orphaned lease cleanup from stale sessions in region %s", region)
2083+
20822084
var distinctSessions []tree.Datums
20832085
aostTime := hlc.Timestamp{WallTime: initialTimestamp}
20842086
distinctSessionQuery := `SELECT DISTINCT(session_id) FROM system.lease AS OF SYSTEM TIME %s WHERE crdb_region=$1 AND NOT crdb_internal.sql_liveness_is_alive(session_id, true) LIMIT $2`
20852087
syntheticDescriptors := catalog.Descriptors{systemschema.LeaseTable()}
20862088
const limit = 50
2089+
2090+
totalSessionsProcessed := 0
2091+
totalLeasesDeleted := 0
2092+
20872093
for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
20882094
// Get a list of distinct, dead session IDs that exist in the system.lease
20892095
// table.
@@ -2105,20 +2111,35 @@ func (m *Manager) deleteOrphanedLeasesFromStaleSession(
21052111
}
21062112
}
21072113

2114+
if len(distinctSessions) > 0 {
2115+
log.Infof(ctx, "found %d dead sessions from which to clean up orphaned leases", len(distinctSessions))
2116+
}
2117+
21082118
// Delete rows in our lease table with orphaned sessions.
21092119
for _, sessionRow := range distinctSessions {
21102120
sessionID := sqlliveness.SessionID(tree.MustBeDBytes(sessionRow[0]))
2111-
if err = deleteLeaseWithSessionIDWithBatch(ctx, ex, retryOpts, syntheticDescriptors, sessionID, region, limit); err != nil {
2112-
log.Warningf(ctx, "unable to delete orphaned leases: %v", err)
2121+
sessionLeasesDeleted, err := deleteLeaseWithSessionIDWithBatch(ctx, ex, retryOpts, syntheticDescriptors, sessionID, region, limit)
2122+
if err != nil {
2123+
log.Warningf(ctx, "unable to delete orphaned leases for session %s: %v", sessionID, err)
21132124
break
21142125
}
2126+
totalLeasesDeleted += sessionLeasesDeleted
2127+
log.Infof(ctx, "deleted %d orphaned leases for dead session %s", sessionLeasesDeleted, sessionID)
21152128
}
21162129

2130+
totalSessionsProcessed += len(distinctSessions)
2131+
21172132
// No more dead sessions to clean up.
21182133
if len(distinctSessions) < limit {
2134+
log.Infof(ctx, "completed orphaned lease cleanup for region %s: %d sessions processed, %d leases deleted",
2135+
region, totalSessionsProcessed, totalLeasesDeleted)
21192136
return
21202137
}
21212138

2139+
// Log progress for large cleanup operations.
2140+
log.Infof(ctx, "orphaned lease cleanup progress for region %s: %d sessions processed, %d leases deleted so far",
2141+
region, totalSessionsProcessed, totalLeasesDeleted)
2142+
21222143
// Advance our aostTime timstamp so that our query to detect leases with
21232144
// dead sessions is aware of new deletes and does not keep selecting the
21242145
// same leases.
@@ -2127,7 +2148,7 @@ func (m *Manager) deleteOrphanedLeasesFromStaleSession(
21272148
}
21282149

21292150
// deleteLeaseWithSessionIDWithBatch uses batchSize to batch deletes for leases
2130-
// with the given sessionID in system.lease.
2151+
// with the given sessionID in system.lease. Returns the total number of leases deleted.
21312152
func deleteLeaseWithSessionIDWithBatch(
21322153
ctx context.Context,
21332154
ex isql.Executor,
@@ -2136,7 +2157,8 @@ func deleteLeaseWithSessionIDWithBatch(
21362157
sessionID sqlliveness.SessionID,
21372158
region string,
21382159
batchSize int,
2139-
) error {
2160+
) (int, error) {
2161+
totalDeleted := 0
21402162
for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
21412163
var rowsDeleted int
21422164
deleteOrphanedQuery := `DELETE FROM system.lease WHERE session_id=$1 AND crdb_region=$2 LIMIT $3`
@@ -2151,16 +2173,17 @@ func deleteLeaseWithSessionIDWithBatch(
21512173
return err
21522174
}); err != nil {
21532175
if !startup.IsRetryableReplicaError(err) {
2154-
return err
2176+
return totalDeleted, err
21552177
}
21562178
}
2179+
totalDeleted += rowsDeleted
21572180

21582181
// No more rows to clean up.
21592182
if rowsDeleted < batchSize {
21602183
break
21612184
}
21622185
}
2163-
return nil
2186+
return totalDeleted, nil
21642187
}
21652188

21662189
func (m *Manager) deleteOrphanedLeasesWithSameInstanceID(
@@ -2196,8 +2219,15 @@ func (m *Manager) deleteOrphanedLeasesWithSameInstanceID(
21962219
log.Warningf(ctx, "unable to read orphaned leases: %v", err)
21972220
return
21982221
}
2222+
2223+
totalLeases := len(rows)
2224+
log.Infof(ctx, "found %d orphaned leases to clean up for instance ID %d", totalLeases, instanceID)
2225+
if totalLeases == 0 {
2226+
return
2227+
}
2228+
21992229
var wg sync.WaitGroup
2200-
defer wg.Wait()
2230+
var releasedCount atomic.Int64
22012231
for i := range rows {
22022232
// Early exit?
22032233
row := rows[i]
@@ -2222,11 +2252,23 @@ func (m *Manager) deleteOrphanedLeasesWithSameInstanceID(
22222252
WaitForSem: true,
22232253
},
22242254
func(ctx context.Context) {
2255+
defer wg.Done()
22252256
m.storage.release(ctx, m.stopper, lease)
2257+
released := releasedCount.Add(1)
22262258
log.Infof(ctx, "released orphaned lease: %+v", lease)
2227-
wg.Done()
2259+
2260+
// Log progress every 100 leases for large cleanup operations.
2261+
if released%100 == 0 || released == int64(totalLeases) {
2262+
log.Infof(ctx, "orphaned lease cleanup progress for instance ID %d: %d/%d leases released",
2263+
instanceID, released, totalLeases)
2264+
}
22282265
}); err != nil {
2266+
log.Warningf(ctx, "could not start async task for releasing orphaned lease %+v: %v", lease, err)
22292267
wg.Done()
22302268
}
22312269
}
2270+
2271+
wg.Wait()
2272+
log.Infof(ctx, "completed orphaned lease cleanup for instance ID %d: %d/%d leases released",
2273+
instanceID, releasedCount.Load(), totalLeases)
22322274
}

0 commit comments

Comments
 (0)