@@ -46,7 +46,7 @@ internal sealed partial class GrainDirectoryPartition : SystemTarget, IGrainDire
4646
4747 private readonly TimeSpan _leaseHoldDuration ;
4848 private readonly List < ( RingRange Range , DateTime Expiration ) > _rangeLeaseHolds = [ ] ;
49- private readonly Dictionary < GrainId , ( SiloAddress DeadSilo , DateTime LeaseExpiration ) > _grainLeaseHolds = [ ] ;
49+ private readonly Dictionary < SiloAddress , DateTime > _siloLeaseHolds = [ ] ;
5050
5151 /// <param name="partitionIndex">The index of this partition on this silo. Each silo hosts a fixed number of dynamically sized partitions.</param>
5252 public GrainDirectoryPartition (
@@ -258,37 +258,38 @@ private void OnSiloRemovedFromCluster(ClusterMember change, SiloStatus previousS
258258 // If it was ShuttingDown, it surrendered its ownership gracefully.
259259 // If it was Active (or Joining) and suddenly became Dead, it crashed.
260260
261- var isUngraceful = previousStatus is not SiloStatus . ShuttingDown ;
262- var expiration = _timeProvider . GetUtcNow ( ) . UtcDateTime . Add ( _leaseHoldDuration ) ;
263- var toRemove = new List < GrainAddress > ( ) ;
261+ if ( previousStatus is not SiloStatus . ShuttingDown && _leaseHoldDuration > TimeSpan . Zero )
262+ {
263+ // Instead of just deleting, we mark it as tombstoned.
264+ // This prevents a new activation on a healthy silo from registering
265+ // until we are sure the dead silo has actually stopped processing.
266+
267+ var expiration = _timeProvider . GetUtcNow ( ) . UtcDateTime . Add ( _leaseHoldDuration ) ;
264268
265- foreach ( var entry in _directory )
269+ _siloLeaseHolds [ change . SiloAddress ] = expiration ;
270+
271+ LogDebugLeaseHoldForSilo ( _logger , change . SiloAddress , expiration ) ;
272+ }
273+ else
266274 {
267- if ( change . SiloAddress . Equals ( entry . Value . SiloAddress ) )
268- {
269- toRemove . Add ( entry . Value ) ;
275+ var toRemove = new List < GrainAddress > ( ) ;
270276
271- if ( isUngraceful )
277+ foreach ( var entry in _directory )
278+ {
279+ if ( change . SiloAddress . Equals ( entry . Value . SiloAddress ) )
272280 {
273- // Instead of just deleting, we mark it as tombstoned.
274- // This prevents a new activation on a healthy silo from registering
275- // until we are sure the dead silo has actually stopped processing.
276- // So we block re-registration of this specific grain id if we did not already,
277- // or extend based on the new expiration time.
278-
279- _grainLeaseHolds [ entry . Key ] = ( change . SiloAddress , expiration ) ;
280- LogDebugLeaseHoldForGrain ( _logger , entry . Key , change . SiloAddress , expiration ) ;
281+ toRemove . Add ( entry . Value ) ;
281282 }
282283 }
283- }
284-
285- if ( toRemove . Count > 0 )
286- {
287- LogDebugDeletingEntries ( _logger , toRemove . Count , change . SiloAddress ) ;
288284
289- foreach ( var grainAddress in toRemove )
285+ if ( toRemove . Count > 0 )
290286 {
291- DeregisterCore ( grainAddress ) ;
287+ LogDebugDeletingEntries ( _logger , toRemove . Count , change . SiloAddress ) ;
288+
289+ foreach ( var grainAddress in toRemove )
290+ {
291+ DeregisterCore ( grainAddress ) ;
292+ }
292293 }
293294 }
294295
@@ -490,10 +491,13 @@ private async Task AcquireRangeAsync(DirectoryMembershipSnapshot previous, Direc
490491 var recovered = false ;
491492 if ( ! success )
492493 {
493- // We pessimistically asssume if snapshot transfer failed, than safety is needed.
494- var expiration = _timeProvider . GetUtcNow ( ) . UtcDateTime . Add ( _leaseHoldDuration ) ;
495- _rangeLeaseHolds . Add ( ( addedRange , expiration ) ) ;
496- LogWarningLeaseHoldForRange ( _logger , addedRange , expiration ) ;
494+ if ( _leaseHoldDuration > TimeSpan . Zero )
495+ {
496+ // We pessimistically assume that if the snapshot transfer failed, then safety is needed.
497+ var expiration = _timeProvider . GetUtcNow ( ) . UtcDateTime . Add ( _leaseHoldDuration ) ;
498+ _rangeLeaseHolds . Add ( ( addedRange , expiration ) ) ;
499+ LogWarningLeaseHoldForRange ( _logger , addedRange , expiration ) ;
500+ }
497501
498502 // Wait for previous versions to be unlocked before proceeding.
499503 await WaitForRange ( addedRange , previous . Version ) ;
@@ -804,21 +808,31 @@ private void CleanupExpiredLeasesCore()
804808 }
805809 }
806810
807- if ( _grainLeaseHolds . Count > 0 )
811+ if ( _siloLeaseHolds . Count > 0 )
808812 {
809- var expiredKeys = _grainLeaseHolds
810- . Where ( kvp => utcNow >= kvp . Value . LeaseExpiration )
813+ var expiredSilos = _siloLeaseHolds
814+ . Where ( kvp => utcNow >= kvp . Value )
811815 . Select ( kvp => kvp . Key )
812816 . ToList ( ) ;
813817
814- foreach ( var key in expiredKeys )
818+ if ( expiredSilos . Count > 0 )
815819 {
816- _grainLeaseHolds . Remove ( key ) ;
817- }
820+ // These are the grains which we were supposed to have removed when the silo was marked as dead,
821+ // but we kept them around until we were sure the silo was actually dead.
818822
819- if ( expiredKeys . Count > 0 )
820- {
821- LogDebugPrunedExpiredGrainLeaseHolds ( _logger , expiredKeys . Count ) ;
823+ var toRemove = _directory . Where ( kvp => expiredSilos . Contains ( kvp . Value . SiloAddress ! ) ) . ToList ( ) ;
824+
825+ foreach ( var kvp in toRemove )
826+ {
827+ _directory . Remove ( kvp . Key ) ;
828+ }
829+
830+ foreach ( var silo in expiredSilos )
831+ {
832+ _siloLeaseHolds . Remove ( silo ) ;
833+ }
834+
835+ LogDebugPrunedExpiredSiloLeaseHolds ( _logger , expiredSilos . Count , toRemove . Count ) ;
822836 }
823837 }
824838 }
@@ -834,21 +848,20 @@ private sealed record class PartitionSnapshotState(
834848
835849 [ LoggerMessage (
836850 Level = LogLevel . Debug ,
837- Message = "Grain {GrainId} from silo {Silo} has been put under a lease until {Expiration}."
838- ) ]
839- private static partial void LogDebugLeaseHoldForGrain ( ILogger logger , GrainId grainId , SiloAddress silo , DateTime expiration ) ;
851+ Message = "Placed lease hold on dead silo {SiloAddress} until {Expiration}." ) ]
852+ private static partial void LogDebugLeaseHoldForSilo ( ILogger logger , SiloAddress siloAddress , DateTime expiration ) ;
853+
854+ [ LoggerMessage (
855+ Level = LogLevel . Debug ,
856+ Message = "Pruned {SiloCount} expired silo lease holds, removing {GrainCount} dead grain activations from the directory." ) ]
857+ private static partial void LogDebugPrunedExpiredSiloLeaseHolds ( ILogger logger , int siloCount , int grainCount ) ;
840858
841859 [ LoggerMessage (
842860 Level = LogLevel . Warning ,
843861 Message = "Grains in the range {Range} have been put under a lease until {Expiration}."
844862 ) ]
845863 private static partial void LogWarningLeaseHoldForRange ( ILogger logger , RingRange range , DateTime expiration ) ;
846864
847- [ LoggerMessage (
848- Level = LogLevel . Debug ,
849- Message = "Pruned {Count} expired grain lease holds."
850- ) ]
851- private static partial void LogDebugPrunedExpiredGrainLeaseHolds ( ILogger logger , int count ) ;
852865
853866 [ LoggerMessage (
854867 Level = LogLevel . Debug ,
0 commit comments