Skip to content

Commit 387b230

Browse files
ledjon-behluliReubenBond
authored andcommitted
address PR feedback
1 parent 7ff6e8d commit 387b230

File tree

5 files changed

+110
-91
lines changed

5 files changed

+110
-91
lines changed

src/Orleans.Runtime/Configuration/Options/GrainDirectoryOptions.cs

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -106,14 +106,12 @@ public enum CachingStrategyType
106106
/// Depending on the value of this, the duration is understood as:
107107
/// <list type="bullet">
108108
/// <item><c>SafetyLeaseHoldDuration > TimeSpan.Zero</c> - The lease duration is explicitly controlled by the user.</item>
109-
/// <item><c>SafetyLeaseHoldDuration &lt;= TimeSpan.Zero</c> - The system computes a lease duration as:
110-
/// <c>2 × <see cref="ClusterMembershipOptions.ProbeTimeout"/> × <see cref="ClusterMembershipOptions.NumMissedProbesLimit"/></c>.</item>
109+
/// <item><c>SafetyLeaseHoldDuration = TimeSpan.Zero</c>. No leases are placed placed at all, effectively nullifying this safety option.</item>
110+
/// <item><c>SafetyLeaseHoldDuration = null</c> - The system computes a lease duration as:
111+
/// <c>2 × <see cref="ClusterMembershipOptions.ProbeTimeout"/> × <see cref="ClusterMembershipOptions.NumMissedProbesLimit"/></c>.
112+
/// This is the default value, and is designed to be long enough to allow for failure detection and cluster stabilization.
113+
/// </item>
111114
/// </list>
112115
/// </remarks>
113-
public TimeSpan SafetyLeaseHoldDuration { get; set; }
114-
115-
/// <summary>
116-
/// The default value for <see cref="SafetyLeaseHoldDuration"/>
117-
/// </summary>
118-
public static readonly TimeSpan DEFAULT_SAFETY_LEASE_HOLD_DURATION = TimeSpan.Zero;
116+
public TimeSpan? SafetyLeaseHoldDuration { get; set; }
119117
}

src/Orleans.Runtime/GrainDirectory/DistributedGrainDirectory.cs

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System.Collections.Immutable;
2+
using System.Diagnostics;
23
using System.Runtime.CompilerServices;
34
using System.Timers;
45
using Microsoft.Extensions.DependencyInjection;
@@ -93,12 +94,13 @@ public DistributedGrainDirectory(
9394
_serviceProvider = serviceProvider;
9495
_membershipService = membershipService;
9596
_logger = logger;
96-
_leaseHoldDuration = directoryOptions.Value.SafetyLeaseHoldDuration;
9797

98-
if (_leaseHoldDuration <= TimeSpan.Zero)
98+
_leaseHoldDuration = directoryOptions.Value.SafetyLeaseHoldDuration switch
9999
{
100-
_leaseHoldDuration = 2 * membershipOptions.Value.ProbeTimeout * membershipOptions.Value.NumMissedProbesLimit;
101-
}
100+
null => 2 * membershipOptions.Value.ProbeTimeout * membershipOptions.Value.NumMissedProbesLimit,
101+
TimeSpan duration when duration >= TimeSpan.Zero => duration,
102+
_ => throw new InvalidOperationException("Lease hold duration must be non-negative.")
103+
};
102104

103105
var partitions = ImmutableArray.CreateBuilder<GrainDirectoryPartition>(DirectoryMembershipSnapshot.PartitionsPerSilo);
104106

@@ -327,7 +329,11 @@ Task OnRuntimeInitializeStart(CancellationToken cancellationToken)
327329
WorkItemGroup.QueueAction(() =>
328330
{
329331
_runTask = ProcessMembershipUpdates();
330-
_leaseCleanupTask = RequestExpiredLeaseCleanups();
332+
333+
if (_leaseHoldDuration > TimeSpan.Zero)
334+
{
335+
_leaseCleanupTask = RequestExpiredLeaseCleanups();
336+
}
331337
});
332338

333339
return Task.CompletedTask;
@@ -418,6 +424,8 @@ private async Task ProcessMembershipUpdates()
418424

419425
private async Task RequestExpiredLeaseCleanups()
420426
{
427+
Debug.Assert(_leaseHoldDuration > TimeSpan.Zero);
428+
421429
// We request cleanups periodically to not let expired leases linger in the directory for too long.
422430
// We do it here as opposed to in the partitions to avoid having 30 (by default, maybe more) timers.
423431

src/Orleans.Runtime/GrainDirectory/GrainDirectoryPartition.Interface.cs

Lines changed: 31 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
using System.Diagnostics;
12
using System.Runtime.InteropServices;
23
using Microsoft.Extensions.Logging;
34

@@ -19,46 +20,44 @@ async ValueTask<DirectoryResult<GrainAddress>> IGrainDirectoryPartition.Register
1920

2021
DebugAssertOwnership(address.GrainId);
2122

22-
var utcNow = _timeProvider.GetUtcNow().UtcDateTime;
23-
var rangeHash = address.GrainId.GetUniformHashCode();
24-
25-
// Range lease holds
26-
for (var i = _rangeLeaseHolds.Count - 1; i >= 0; i--)
23+
if (_leaseHoldDuration > TimeSpan.Zero)
2724
{
28-
var (lockedRange, expiration) = _rangeLeaseHolds[i];
25+
var utcNow = _timeProvider.GetUtcNow().UtcDateTime;
26+
var rangeHash = address.GrainId.GetUniformHashCode();
2927

30-
if (utcNow >= expiration)
28+
// Range lease holds
29+
for (var i = _rangeLeaseHolds.Count - 1; i >= 0; i--)
3130
{
32-
// We use this opportunity to cleanup this expired range lease hold.
33-
_rangeLeaseHolds.RemoveAt(i);
34-
continue;
35-
}
31+
var (lockedRange, expiration) = _rangeLeaseHolds[i];
3632

37-
// If it is still active, does it block this request?
38-
if (lockedRange.Contains(rangeHash))
39-
{
40-
// We reject, the client should retry!
41-
throw new DirectoryLeaseHoldException($"Range {lockedRange} is under a lease hold until {expiration - utcNow}.");
42-
}
43-
}
33+
if (utcNow >= expiration)
34+
{
35+
// We use this opportunity to cleanup this expired range lease hold.
36+
_rangeLeaseHolds.RemoveAt(i);
37+
continue;
38+
}
4439

45-
// Grain lease holds
46-
if (_grainLeaseHolds.TryGetValue(address.GrainId, out var tombstone))
47-
{
48-
if (utcNow >= tombstone.LeaseExpiration)
49-
{
50-
// We use this opportunity to cleanup this expired grain-specific lease hold.
51-
_grainLeaseHolds.Remove(address.GrainId);
40+
// If it is still active, does it block this request?
41+
if (lockedRange.Contains(rangeHash))
42+
{
43+
// We reject, the client should retry!
44+
throw new DirectoryLeaseHoldException($"Range {lockedRange} is under a lease hold until {expiration - utcNow}.");
45+
}
5246
}
53-
else
47+
48+
// Grain lease holds
49+
if (_directory.TryGetValue(address.GrainId, out var existingActivation))
5450
{
55-
// Is the new registration trying to point to the same dead silo?
56-
// If yes, it is consistent, but dead; otherwise we must block.
57-
if (!tombstone.DeadSilo.Equals(address.SiloAddress))
51+
if (_siloLeaseHolds.TryGetValue(existingActivation.SiloAddress!, out var expiration) && utcNow < expiration)
5852
{
59-
// The previous owner is dead, but the lease hasnt expired yet, we must reject.
60-
// We can not guarantee the old activation is gone yet. The client should retry!
61-
throw new DirectoryLeaseHoldException($"Grain {address.GrainId} is under a lease hold until {tombstone.LeaseExpiration - utcNow}.");
53+
// This grain belongs to this parition, and the activation is sitting on a silo that has an active lease hold.
54+
// We need to check if the request include sthe previous activation id, and if it does its a valid update/override,
55+
// otherwise it's a new activation trying to "steal" the id while the lease is active, so we reject it!
56+
57+
if (currentRegistration is null || !existingActivation.Matches(currentRegistration))
58+
{
59+
throw new DirectoryLeaseHoldException($"Silo {existingActivation.SiloAddress} is under a lease hold until {expiration - utcNow}.");
60+
}
6261
}
6362
}
6463
}

src/Orleans.Runtime/GrainDirectory/GrainDirectoryPartition.cs

Lines changed: 58 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ internal sealed partial class GrainDirectoryPartition : SystemTarget, IGrainDire
4646

4747
private readonly TimeSpan _leaseHoldDuration;
4848
private readonly List<(RingRange Range, DateTime Expiration)> _rangeLeaseHolds = [];
49-
private readonly Dictionary<GrainId, (SiloAddress DeadSilo, DateTime LeaseExpiration)> _grainLeaseHolds = [];
49+
private readonly Dictionary<SiloAddress, DateTime> _siloLeaseHolds = [];
5050

5151
/// <param name="partitionIndex">The index of this partition on this silo. Each silo hosts a fixed number of dynamically sized partitions.</param>
5252
public GrainDirectoryPartition(
@@ -258,37 +258,38 @@ private void OnSiloRemovedFromCluster(ClusterMember change, SiloStatus previousS
258258
// If it was ShuttingDown, it surrendered its ownership gracefully.
259259
// If it was Active (or Joining) and suddenly became Dead, it crashed.
260260

261-
var isUngraceful = previousStatus is not SiloStatus.ShuttingDown;
262-
var expiration = _timeProvider.GetUtcNow().UtcDateTime.Add(_leaseHoldDuration);
263-
var toRemove = new List<GrainAddress>();
261+
if (previousStatus is not SiloStatus.ShuttingDown && _leaseHoldDuration > TimeSpan.Zero)
262+
{
263+
// Instead of just deleting, we mark it as tombstoned.
264+
// This prevents a new activation on a healthy silo from registering
265+
// until we are sure the dead silo has actually stopped processing.
266+
267+
var expiration = _timeProvider.GetUtcNow().UtcDateTime.Add(_leaseHoldDuration);
264268

265-
foreach (var entry in _directory)
269+
_siloLeaseHolds[change.SiloAddress] = expiration;
270+
271+
LogDebugLeaseHoldForSilo(_logger, change.SiloAddress, expiration);
272+
}
273+
else
266274
{
267-
if (change.SiloAddress.Equals(entry.Value.SiloAddress))
268-
{
269-
toRemove.Add(entry.Value);
275+
var toRemove = new List<GrainAddress>();
270276

271-
if (isUngraceful)
277+
foreach (var entry in _directory)
278+
{
279+
if (change.SiloAddress.Equals(entry.Value.SiloAddress))
272280
{
273-
// Instead of just deleting, we mark it as tombstoned.
274-
// This prevents a new activation on a healthy silo from registering
275-
// until we are sure the dead silo has actually stopped processing.
276-
// So we block re-registration of this specific grain id if we did not already,
277-
// or extend based on the new expiration time.
278-
279-
_grainLeaseHolds[entry.Key] = (change.SiloAddress, expiration);
280-
LogDebugLeaseHoldForGrain(_logger, entry.Key, change.SiloAddress, expiration);
281+
toRemove.Add(entry.Value);
281282
}
282283
}
283-
}
284-
285-
if (toRemove.Count > 0)
286-
{
287-
LogDebugDeletingEntries(_logger, toRemove.Count, change.SiloAddress);
288284

289-
foreach (var grainAddress in toRemove)
285+
if (toRemove.Count > 0)
290286
{
291-
DeregisterCore(grainAddress);
287+
LogDebugDeletingEntries(_logger, toRemove.Count, change.SiloAddress);
288+
289+
foreach (var grainAddress in toRemove)
290+
{
291+
DeregisterCore(grainAddress);
292+
}
292293
}
293294
}
294295

@@ -490,10 +491,13 @@ private async Task AcquireRangeAsync(DirectoryMembershipSnapshot previous, Direc
490491
var recovered = false;
491492
if (!success)
492493
{
493-
// We pessimistically asssume if snapshot transfer failed, than safety is needed.
494-
var expiration = _timeProvider.GetUtcNow().UtcDateTime.Add(_leaseHoldDuration);
495-
_rangeLeaseHolds.Add((addedRange, expiration));
496-
LogWarningLeaseHoldForRange(_logger, addedRange, expiration);
494+
if (_leaseHoldDuration > TimeSpan.Zero)
495+
{
496+
// We pessimistically asssume if snapshot transfer failed, than safety is needed.
497+
var expiration = _timeProvider.GetUtcNow().UtcDateTime.Add(_leaseHoldDuration);
498+
_rangeLeaseHolds.Add((addedRange, expiration));
499+
LogWarningLeaseHoldForRange(_logger, addedRange, expiration);
500+
}
497501

498502
// Wait for previous versions to be unlocked before proceeding.
499503
await WaitForRange(addedRange, previous.Version);
@@ -804,21 +808,31 @@ private void CleanupExpiredLeasesCore()
804808
}
805809
}
806810

807-
if (_grainLeaseHolds.Count > 0)
811+
if (_siloLeaseHolds.Count > 0)
808812
{
809-
var expiredKeys = _grainLeaseHolds
810-
.Where(kvp => utcNow >= kvp.Value.LeaseExpiration)
813+
var expiredSilos = _siloLeaseHolds
814+
.Where(kvp => utcNow >= kvp.Value)
811815
.Select(kvp => kvp.Key)
812816
.ToList();
813817

814-
foreach (var key in expiredKeys)
818+
if (expiredSilos.Count > 0)
815819
{
816-
_grainLeaseHolds.Remove(key);
817-
}
820+
// These are the grains which we were supposed to have removed when the silo was marked as dead,
821+
// but we kept them around until we were sure the silo was actually dead.
818822

819-
if (expiredKeys.Count > 0)
820-
{
821-
LogDebugPrunedExpiredGrainLeaseHolds(_logger, expiredKeys.Count);
823+
var toRemove = _directory.Where(kvp => expiredSilos.Contains(kvp.Value.SiloAddress!)).ToList();
824+
825+
foreach (var kvp in toRemove)
826+
{
827+
_directory.Remove(kvp.Key);
828+
}
829+
830+
foreach (var silo in expiredSilos)
831+
{
832+
_siloLeaseHolds.Remove(silo);
833+
}
834+
835+
LogDebugPrunedExpiredSiloLeaseHolds(_logger, expiredSilos.Count, toRemove.Count);
822836
}
823837
}
824838
}
@@ -834,21 +848,20 @@ private sealed record class PartitionSnapshotState(
834848

835849
[LoggerMessage(
836850
Level = LogLevel.Debug,
837-
Message = "Grain {GrainId} from silo {Silo} has been put under a lease until {Expiration}."
838-
)]
839-
private static partial void LogDebugLeaseHoldForGrain(ILogger logger, GrainId grainId, SiloAddress silo, DateTime expiration);
851+
Message = "Placed lease hold on dead silo {SiloAddress} until {Expiration}.")]
852+
private static partial void LogDebugLeaseHoldForSilo(ILogger logger, SiloAddress siloAddress, DateTime expiration);
853+
854+
[LoggerMessage(
855+
Level = LogLevel.Debug,
856+
Message = "Pruned {SiloCount} expired silo lease holds, removing {GrainCount} dead grain activations from the directory.")]
857+
private static partial void LogDebugPrunedExpiredSiloLeaseHolds(ILogger logger, int siloCount, int grainCount);
840858

841859
[LoggerMessage(
842860
Level = LogLevel.Warning,
843861
Message = "Grains in the range {Range} have been put under a lease until {Expiration}."
844862
)]
845863
private static partial void LogWarningLeaseHoldForRange(ILogger logger, RingRange range, DateTime expiration);
846864

847-
[LoggerMessage(
848-
Level = LogLevel.Debug,
849-
Message = "Pruned {Count} expired grain lease holds."
850-
)]
851-
private static partial void LogDebugPrunedExpiredGrainLeaseHolds(ILogger logger, int count);
852865

853866
[LoggerMessage(
854867
Level = LogLevel.Debug,

test/Orleans.GrainDirectory.Tests/GrainDirectory/GrainDirectoryLeaseTests.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ public class LeaseTestGrain : Grain, ILeaseTestGrain
2424
public class GrainDirectoryLeaseTests
2525
{
2626
public static readonly FakeTimeProvider TimeProvider = new(DateTime.UtcNow);
27-
public static readonly TimeSpan LeaseHoldDuration = TimeSpan.FromSeconds(5); // The value doesnt matter we will advance time manually.
27+
// The value doesnt matter we will advance time manually.
28+
public static readonly TimeSpan LeaseHoldDuration = TimeSpan.FromSeconds(5);
2829

2930
[Fact]
3031
public async Task BlocksReactivations_AfterUngracefulShutdown()

0 commit comments

Comments
 (0)