Skip to content

Commit 7ff6e8d

Browse files
ledjon-behluliReubenBond
authored andcommitted
grain and range lease holds for even stronger single activation gurantees
1 parent 96b0088 commit 7ff6e8d

File tree

7 files changed

+434
-115
lines changed

7 files changed

+434
-115
lines changed
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
namespace Orleans.Runtime;
2+
3+
/// <summary>
4+
/// Indicates that a request to the grain directory was rejected because the target grain or directory range
5+
/// is currently under a safety lease hold after an ungraceful silo crash.
6+
/// </summary>
7+
/// <remarks>
8+
/// Initializes a new instance of the <see cref="DirectoryLeaseHoldException"/> class.
9+
/// </remarks>
10+
/// <param name="message">The message.</param>
11+
[Serializable]
12+
[GenerateSerializer]
13+
[Alias("DirectoryLeaseHoldException")]
14+
public class DirectoryLeaseHoldException(string message) : OrleansException(message);
Lines changed: 113 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -1,99 +1,119 @@
1-
2-
using System;
31
using Orleans.Runtime.GrainDirectory;
42

5-
namespace Orleans.Configuration
3+
namespace Orleans.Configuration;
4+
5+
public class GrainDirectoryOptions
66
{
7-
public class GrainDirectoryOptions
7+
/// <summary>
8+
/// Configuration type that controls the type of the grain directory caching algorithm that silo use.
9+
/// </summary>
10+
public enum CachingStrategyType
811
{
9-
/// <summary>
10-
/// Configuration type that controls the type of the grain directory caching algorithm that silo use.
11-
/// </summary>
12-
public enum CachingStrategyType
13-
{
14-
/// <summary>Don't cache.</summary>
15-
None,
16-
/// <summary>Standard fixed-size LRU.</summary>
17-
LRU,
18-
/// <summary>Adaptive caching with fixed maximum size and refresh. This option should be used in production.</summary>
19-
[Obsolete("Adaptive caching is deprecated in favor of LRU and will be removed in a future version. This value is now an alias for LRU.")]
20-
Adaptive,
21-
/// <summary>Custom cache implementation, configured by registering an <see cref="IGrainDirectoryCache"/> implementation in the dependency injection container.</summary>
22-
Custom
23-
}
24-
25-
/// <summary>
26-
/// Gets or sets the caching strategy to use.
27-
/// The options are None, which means don't cache directory entries locally;
28-
/// LRU, which indicates that a standard fixed-size least recently used strategy should be used; and
29-
/// Adaptive, which indicates that an adaptive strategy with a fixed maximum size should be used.
30-
/// The LRU strategy is used by default.
31-
/// </summary>
32-
public CachingStrategyType CachingStrategy { get; set; } = DEFAULT_CACHING_STRATEGY;
33-
34-
/// <summary>
35-
/// The default value for <see cref="CachingStrategy"/>.
36-
/// </summary>
37-
public const CachingStrategyType DEFAULT_CACHING_STRATEGY = CachingStrategyType.LRU;
38-
39-
/// <summary>
40-
/// Gets or sets the maximum number of grains to cache directory information for.
41-
/// </summary>
42-
public int CacheSize { get; set; } = DEFAULT_CACHE_SIZE;
43-
44-
/// <summary>
45-
/// The default value for <see cref="CacheSize"/>.
46-
/// </summary>
47-
public const int DEFAULT_CACHE_SIZE = 1_000_000;
48-
49-
/// <summary>
50-
/// Gets or sets the initial (minimum) time, in seconds, to keep a cache entry before revalidating.
51-
/// </summary>
52-
[Obsolete("InitialCacheTTL is deprecated and will be removed in a future version.")]
53-
public TimeSpan InitialCacheTTL { get; set; } = DEFAULT_INITIAL_CACHE_TTL;
54-
55-
/// <summary>
56-
/// The default value for <see cref="InitialCacheTTL"/>.
57-
/// </summary>
58-
[Obsolete("DEFAULT_INITIAL_CACHE_TTL is deprecated and will be removed in a future version.")]
59-
public static readonly TimeSpan DEFAULT_INITIAL_CACHE_TTL = TimeSpan.FromSeconds(30);
60-
61-
/// <summary>
62-
/// Gets or sets the maximum time, in seconds, to keep a cache entry before revalidating.
63-
/// </summary>
64-
[Obsolete("MaximumCacheTTL is deprecated and will be removed in a future version.")]
65-
public TimeSpan MaximumCacheTTL { get; set; } = DEFAULT_MAXIMUM_CACHE_TTL;
66-
67-
/// <summary>
68-
/// The default value for <see cref="MaximumCacheTTL"/>.
69-
/// </summary>
70-
[Obsolete("DEFAULT_MAXIMUM_CACHE_TTL is deprecated and will be removed in a future version.")]
71-
public static readonly TimeSpan DEFAULT_MAXIMUM_CACHE_TTL = TimeSpan.FromSeconds(240);
72-
73-
/// <summary>
74-
/// Gets or sets the factor by which cache entry TTLs should be extended when they are found to be stable.
75-
/// </summary>
76-
[Obsolete("CacheTTLExtensionFactor is deprecated and will be removed in a future version.")]
77-
public double CacheTTLExtensionFactor { get; set; } = DEFAULT_TTL_EXTENSION_FACTOR;
78-
79-
/// <summary>
80-
/// The default value for <see cref="CacheTTLExtensionFactor"/>.
81-
/// </summary>
82-
[Obsolete("DEFAULT_TTL_EXTENSION_FACTOR is deprecated and will be removed in a future version.")]
83-
public const double DEFAULT_TTL_EXTENSION_FACTOR = 2.0;
84-
85-
/// <summary>
86-
/// Gets or sets the time span between when we have added an entry for an activation to the grain directory and when we are allowed
87-
/// to conditionally remove that entry.
88-
/// Conditional deregistration is used for lazy clean-up of activations whose prompt deregistration failed for some reason (e.g., message failure).
89-
/// This should always be at least one minute, since we compare the times on the directory partition, so message delays and clcks skues have
90-
/// to be allowed.
91-
/// </summary>
92-
public TimeSpan LazyDeregistrationDelay { get; set; } = DEFAULT_UNREGISTER_RACE_DELAY;
93-
94-
/// <summary>
95-
/// The default value for <see cref="LazyDeregistrationDelay"/>.
96-
/// </summary>
97-
public static readonly TimeSpan DEFAULT_UNREGISTER_RACE_DELAY = TimeSpan.FromMinutes(1);
12+
/// <summary>Don't cache.</summary>
13+
None,
14+
/// <summary>Standard fixed-size LRU.</summary>
15+
LRU,
16+
/// <summary>Adaptive caching with fixed maximum size and refresh. This option should be used in production.</summary>
17+
[Obsolete("Adaptive caching is deprecated in favor of LRU and will be removed in a future version. This value is now an alias for LRU.")]
18+
Adaptive,
19+
/// <summary>Custom cache implementation, configured by registering an <see cref="IGrainDirectoryCache"/> implementation in the dependency injection container.</summary>
20+
Custom
9821
}
22+
23+
/// <summary>
24+
/// Gets or sets the caching strategy to use.
25+
/// The options are None, which means don't cache directory entries locally;
26+
/// LRU, which indicates that a standard fixed-size least recently used strategy should be used; and
27+
/// Adaptive, which indicates that an adaptive strategy with a fixed maximum size should be used.
28+
/// The LRU strategy is used by default.
29+
/// </summary>
30+
public CachingStrategyType CachingStrategy { get; set; } = DEFAULT_CACHING_STRATEGY;
31+
32+
/// <summary>
33+
/// The default value for <see cref="CachingStrategy"/>.
34+
/// </summary>
35+
public const CachingStrategyType DEFAULT_CACHING_STRATEGY = CachingStrategyType.LRU;
36+
37+
/// <summary>
38+
/// Gets or sets the maximum number of grains to cache directory information for.
39+
/// </summary>
40+
public int CacheSize { get; set; } = DEFAULT_CACHE_SIZE;
41+
42+
/// <summary>
43+
/// The default value for <see cref="CacheSize"/>.
44+
/// </summary>
45+
public const int DEFAULT_CACHE_SIZE = 1_000_000;
46+
47+
/// <summary>
48+
/// Gets or sets the initial (minimum) time, in seconds, to keep a cache entry before revalidating.
49+
/// </summary>
50+
[Obsolete("InitialCacheTTL is deprecated and will be removed in a future version.")]
51+
public TimeSpan InitialCacheTTL { get; set; } = DEFAULT_INITIAL_CACHE_TTL;
52+
53+
/// <summary>
54+
/// The default value for <see cref="InitialCacheTTL"/>.
55+
/// </summary>
56+
[Obsolete("DEFAULT_INITIAL_CACHE_TTL is deprecated and will be removed in a future version.")]
57+
public static readonly TimeSpan DEFAULT_INITIAL_CACHE_TTL = TimeSpan.FromSeconds(30);
58+
59+
/// <summary>
60+
/// Gets or sets the maximum time, in seconds, to keep a cache entry before revalidating.
61+
/// </summary>
62+
[Obsolete("MaximumCacheTTL is deprecated and will be removed in a future version.")]
63+
public TimeSpan MaximumCacheTTL { get; set; } = DEFAULT_MAXIMUM_CACHE_TTL;
64+
65+
/// <summary>
66+
/// The default value for <see cref="MaximumCacheTTL"/>.
67+
/// </summary>
68+
[Obsolete("DEFAULT_MAXIMUM_CACHE_TTL is deprecated and will be removed in a future version.")]
69+
public static readonly TimeSpan DEFAULT_MAXIMUM_CACHE_TTL = TimeSpan.FromSeconds(240);
70+
71+
/// <summary>
72+
/// Gets or sets the factor by which cache entry TTLs should be extended when they are found to be stable.
73+
/// </summary>
74+
[Obsolete("CacheTTLExtensionFactor is deprecated and will be removed in a future version.")]
75+
public double CacheTTLExtensionFactor { get; set; } = DEFAULT_TTL_EXTENSION_FACTOR;
76+
77+
/// <summary>
78+
/// The default value for <see cref="CacheTTLExtensionFactor"/>.
79+
/// </summary>
80+
[Obsolete("DEFAULT_TTL_EXTENSION_FACTOR is deprecated and will be removed in a future version.")]
81+
public const double DEFAULT_TTL_EXTENSION_FACTOR = 2.0;
82+
83+
/// <summary>
84+
/// Gets or sets the time span between when we have added an entry for an activation to the grain directory and when we are allowed
85+
/// to conditionally remove that entry.
86+
/// Conditional deregistration is used for lazy clean-up of activations whose prompt deregistration failed for some reason (e.g., message failure).
87+
/// This should always be at least one minute, since we compare the times on the directory partition, so message delays and clcks skues have
88+
/// to be allowed.
89+
/// </summary>
90+
public TimeSpan LazyDeregistrationDelay { get; set; } = DEFAULT_UNREGISTER_RACE_DELAY;
91+
92+
/// <summary>
93+
/// The default value for <see cref="LazyDeregistrationDelay"/>.
94+
/// </summary>
95+
public static readonly TimeSpan DEFAULT_UNREGISTER_RACE_DELAY = TimeSpan.FromMinutes(1);
96+
97+
/// <summary>
98+
/// Gets or sets the duration for the safety lease hold applied after an ungraceful silo failure.
99+
/// This duration applies in two scenarios:
100+
/// <list type="bullet">
101+
/// <item>When a specific silo crashes ungracefully, grain lease holds prevent individual re-registration of its grains for this duration.</item>
102+
/// <item>When a directory partition can not acquire a snapshot from a previous owner, range lease holds prevent new registrations in that whole range for this duration.</item>
103+
/// </list>
104+
/// </summary>
105+
/// <remarks>
106+
/// Depending on the value of this, the duration is understood as:
107+
/// <list type="bullet">
108+
/// <item><c>SafetyLeaseHoldDuration > TimeSpan.Zero</c> - The lease duration is explicitly controlled by the user.</item>
109+
/// <item><c>SafetyLeaseHoldDuration &lt;= TimeSpan.Zero</c> - The system computes a lease duration as:
110+
/// <c>2 × <see cref="ClusterMembershipOptions.ProbeTimeout"/> × <see cref="ClusterMembershipOptions.NumMissedProbesLimit"/></c>.</item>
111+
/// </list>
112+
/// </remarks>
113+
public TimeSpan SafetyLeaseHoldDuration { get; set; }
114+
115+
/// <summary>
116+
/// The default value for <see cref="SafetyLeaseHoldDuration"/>
117+
/// </summary>
118+
public static readonly TimeSpan DEFAULT_SAFETY_LEASE_HOLD_DURATION = TimeSpan.Zero;
99119
}

src/Orleans.Runtime/GrainDirectory/DistributedGrainDirectory.cs

Lines changed: 61 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1-
using System;
2-
using System.Collections.Generic;
31
using System.Collections.Immutable;
4-
using System.Linq;
52
using System.Runtime.CompilerServices;
6-
using System.Threading;
7-
using System.Threading.Tasks;
3+
using System.Timers;
84
using Microsoft.Extensions.DependencyInjection;
95
using Microsoft.Extensions.Logging;
6+
using Microsoft.Extensions.Options;
107
using Orleans.Concurrency;
8+
using Orleans.Configuration;
119
using Orleans.GrainDirectory;
1210
using Orleans.Internal;
1311
using Orleans.Runtime.Internal;
@@ -63,6 +61,7 @@ internal sealed partial class DistributedGrainDirectory : SystemTarget, IGrainDi
6361
private readonly IServiceProvider _serviceProvider;
6462
private readonly ImmutableArray<GrainDirectoryPartition> _partitions;
6563
private readonly CancellationTokenSource _stoppedCts = new();
64+
private readonly TimeSpan _leaseHoldDuration;
6665

6766
internal CancellationToken OnStoppedToken => _stoppedCts.Token;
6867
internal ClusterMembershipSnapshot ClusterMembershipSnapshot => _membershipService.CurrentView.ClusterMembershipSnapshot;
@@ -75,25 +74,37 @@ internal sealed partial class DistributedGrainDirectory : SystemTarget, IGrainDi
7574
// precise by also tracking the sets of ranges which need to be recovered, but that complicates things somewhat since it would require tracking the ranges
7675
// for each recovery version.
7776
private long _recoveryMembershipVersion;
78-
private Task _runTask = Task.CompletedTask;
79-
private ActivationDirectory _localActivations;
77+
private readonly ActivationDirectory _localActivations;
8078
private GrainDirectoryResolver? _grainDirectoryResolver;
79+
private Task? _runTask;
80+
private Task? _leaseCleanupTask;
8181

8282
public DistributedGrainDirectory(
8383
DirectoryMembershipService membershipService,
8484
ILogger<DistributedGrainDirectory> logger,
8585
IServiceProvider serviceProvider,
8686
IInternalGrainFactory grainFactory,
87+
IOptions<GrainDirectoryOptions> directoryOptions,
88+
IOptions<ClusterMembershipOptions> membershipOptions,
89+
TimeProvider timeProvider,
8790
SystemTargetShared shared) : base(Constants.GrainDirectoryType, shared)
8891
{
8992
_localActivations = shared.ActivationDirectory;
9093
_serviceProvider = serviceProvider;
9194
_membershipService = membershipService;
9295
_logger = logger;
96+
_leaseHoldDuration = directoryOptions.Value.SafetyLeaseHoldDuration;
97+
98+
if (_leaseHoldDuration <= TimeSpan.Zero)
99+
{
100+
_leaseHoldDuration = 2 * membershipOptions.Value.ProbeTimeout * membershipOptions.Value.NumMissedProbesLimit;
101+
}
102+
93103
var partitions = ImmutableArray.CreateBuilder<GrainDirectoryPartition>(DirectoryMembershipSnapshot.PartitionsPerSilo);
104+
94105
for (var i = 0; i < DirectoryMembershipSnapshot.PartitionsPerSilo; i++)
95106
{
96-
partitions.Add(new GrainDirectoryPartition(i, this, grainFactory, shared));
107+
partitions.Add(new GrainDirectoryPartition(i, this, _leaseHoldDuration, grainFactory, timeProvider, shared));
97108
}
98109

99110
_partitions = partitions.ToImmutable();
@@ -312,19 +323,27 @@ void ILifecycleParticipant<ISiloLifecycle>.Participate(ISiloLifecycle observer)
312323
Task OnRuntimeInitializeStart(CancellationToken cancellationToken)
313324
{
314325
using var _ = new ExecutionContextSuppressor();
315-
WorkItemGroup.QueueAction(() => _runTask = ProcessMembershipUpdates());
326+
327+
WorkItemGroup.QueueAction(() =>
328+
{
329+
_runTask = ProcessMembershipUpdates();
330+
_leaseCleanupTask = RequestExpiredLeaseCleanups();
331+
});
316332

317333
return Task.CompletedTask;
318334
}
319335

320336
async Task OnRuntimeInitializeStop(CancellationToken cancellationToken)
321337
{
322338
_stoppedCts.Cancel();
339+
323340
if (_runTask is { } task)
324341
{
325342
// Try to wait for hand-off to complete.
326343
await this.RunOrQueueTask(async () => await task.WaitAsync(cancellationToken).SuppressThrowing());
327344
}
345+
346+
// No need to wait on the cleanup task since it does not have any external effects.
328347
}
329348

330349
async Task OnShuttingDown(CancellationToken token)
@@ -359,9 +378,10 @@ private async Task ProcessMembershipUpdates()
359378
{
360379
if (change.Status == SiloStatus.Dead)
361380
{
381+
var previousStatus = previousUpdate.GetSiloStatus(change.SiloAddress);
362382
foreach (var partition in _partitions)
363383
{
364-
tasks.Add(partition.OnSiloRemovedFromClusterAsync(change));
384+
tasks.Add(partition.OnSiloRemovedFromClusterAsync(change, previousStatus));
365385
}
366386
}
367387
}
@@ -396,6 +416,37 @@ private async Task ProcessMembershipUpdates()
396416
await Task.WhenAll(tasks).SuppressThrowing();
397417
}
398418

419+
private async Task RequestExpiredLeaseCleanups()
420+
{
421+
// We request cleanups periodically to not let expired leases linger in the directory for too long.
422+
// We do it here as opposed to in the partitions to avoid having 30 (by default, maybe more) timers.
423+
424+
var period = 1.1 * _leaseHoldDuration;
425+
426+
if (period < TimeSpan.FromMinutes(1))
427+
{
428+
// We create a lower-bound to avoid creting too much overhead in the partitions.
429+
period = TimeSpan.FromMinutes(1);
430+
}
431+
432+
using var timer = new PeriodicTimer(period);
433+
434+
try
435+
{
436+
while (await timer.WaitForNextTickAsync(_stoppedCts.Token))
437+
{
438+
foreach (var partition in _partitions)
439+
{
440+
partition.CleanupExpiredLeases();
441+
}
442+
}
443+
}
444+
catch (OperationCanceledException) when (_stoppedCts.IsCancellationRequested)
445+
{
446+
// Ignore
447+
}
448+
}
449+
399450
SiloAddress? ITestHooks.GetPrimaryForGrain(GrainId grainId)
400451
{
401452
_membershipService.CurrentView.TryGetOwner(grainId, out var owner, out _);

0 commit comments

Comments
 (0)