33using System . Collections . Immutable ;
44using System . ComponentModel ;
55using System . Diagnostics . CodeAnalysis ;
6+ using System . Threading ;
67using System . Threading . Tasks ;
7- using Microsoft . Extensions . DependencyInjection ;
88using Microsoft . Extensions . Logging ;
99using Microsoft . Extensions . Options ;
1010using Orleans . Configuration ;
@@ -20,15 +20,15 @@ internal sealed partial class LocalGrainDirectory : ILocalGrainDirectory, ISiloS
2020 private readonly SiloAddress ? seed ;
2121 private readonly ISiloStatusOracle siloStatusOracle ;
2222 private readonly IInternalGrainFactory grainFactory ;
23+ private readonly ActivationDirectory activations ;
24+ private readonly GrainDirectoryResolver grainDirectoryResolver ;
2325 private readonly object writeLock = new object ( ) ;
24- private readonly IServiceProvider _serviceProvider ;
2526 private DirectoryMembership directoryMembership = DirectoryMembership . Default ;
2627
2728 // Consider: move these constants into an appropriate place
2829 internal const int HOP_LIMIT = 6 ; // forward a remote request no more than 5 times
2930 public static readonly TimeSpan RETRY_DELAY = TimeSpan . FromMilliseconds ( 200 ) ; // Pause 200ms between forwards to let the membership directory settle down
3031 internal bool Running ;
31- private Catalog ? _catalog ;
3232
3333 internal SiloAddress MyAddress { get ; }
3434
@@ -45,6 +45,8 @@ public LocalGrainDirectory(
4545 ILocalSiloDetails siloDetails ,
4646 ISiloStatusOracle siloStatusOracle ,
4747 IInternalGrainFactory grainFactory ,
48+ ActivationDirectory activationDirectory ,
49+ GrainDirectoryResolver grainDirectoryResolver ,
4850 IOptions < DevelopmentClusterMembershipOptions > developmentClusterMembershipOptions ,
4951 IOptions < GrainDirectoryOptions > grainDirectoryOptions ,
5052 ILoggerFactory loggerFactory ,
@@ -57,6 +59,8 @@ public LocalGrainDirectory(
5759
5860 this . siloStatusOracle = siloStatusOracle ;
5961 this . grainFactory = grainFactory ;
62+ this . activations = activationDirectory ;
63+ this . grainDirectoryResolver = grainDirectoryResolver ;
6064
6165 DirectoryCache = GrainDirectoryCacheFactory . CreateGrainDirectoryCache ( serviceProvider , grainDirectoryOptions . Value ) ;
6266
@@ -84,7 +88,6 @@ public LocalGrainDirectory(
8488 return ring . Count == 0 ? 0 : ( ( float ) 100 / ( float ) ring . Count ) ;
8589 } ) ;
8690 DirectoryInstruments . RegisterRingSizeObserve ( ( ) => this . directoryMembership . MembershipRingList . Count ) ;
87- _serviceProvider = serviceProvider ;
8891 }
8992
9093 public void Start ( )
@@ -151,26 +154,21 @@ private void AddServer(SiloAddress silo)
151154
152155 private void RemoveServer ( SiloAddress silo , SiloStatus status )
153156 {
157+ List < IGrainContext > ? activationsToShutdown = null ;
158+
154159 lock ( this . writeLock )
155160 {
156- try
157- {
158- // Only notify the catalog once. Order is important: call BEFORE updating membershipRingList.
159- _catalog = _serviceProvider . GetRequiredService < Catalog > ( ) ;
160- _catalog . OnSiloStatusChange ( this , silo , status ) ;
161- }
162- catch ( Exception exc )
163- {
164- LogErrorCatalogSiloStatusChangeNotificationException ( exc , new ( silo ) ) ;
165- }
166-
167161 var existing = this . directoryMembership ;
168162 if ( ! existing . MembershipCache . Contains ( silo ) )
169163 {
170164 // we have already removed this silo
171165 return ;
172166 }
173167
168+ // Collect activations to deactivate BEFORE updating membershipRingList,
169+ // since GetPrimaryForGrain depends on the current ring membership.
170+ activationsToShutdown = CollectActivationsToDeactivate ( silo ) ;
171+
174172 this . directoryMembership = new DirectoryMembership (
175173 existing . MembershipRingList . Remove ( silo ) ,
176174 existing . MembershipCache . Remove ( silo ) ) ;
@@ -180,6 +178,64 @@ private void RemoveServer(SiloAddress silo, SiloStatus status)
180178
181179 LogDebugSiloRemovedSilo ( MyAddress , silo ) ;
182180 }
181+
182+ // Deactivate activations outside the lock
183+ if ( activationsToShutdown is { Count : > 0 } )
184+ {
185+ var reasonText = $ "This activation is being deactivated due to a failure of server { silo } , since it was responsible for this activation's grain directory registration.";
186+ var reason = new DeactivationReason ( DeactivationReasonCode . DirectoryFailure , reasonText ) ;
187+ DeactivateActivations ( reason , activationsToShutdown ) ;
188+ }
189+ }
190+
/// <summary>
/// Builds the list of local activations for which <paramref name="removedSilo"/> is the silo
/// returned by <c>GetPrimaryForGrain</c>, i.e. the primary directory partition owner.
/// </summary>
/// <param name="removedSilo">The silo that is being removed from the directory membership.</param>
/// <returns>
/// The matching grain contexts. The list is empty (never <see langword="null"/>) when nothing matches
/// or when the scan fails; failures are logged and not rethrown.
/// </returns>
private List<IGrainContext> CollectActivationsToDeactivate(SiloAddress removedSilo)
{
    var affected = new List<IGrainContext>();
    try
    {
        // Walk every known activation and keep those whose primary partition owner is the removed silo.
        foreach (var entry in activations)
        {
            try
            {
                var grainContext = entry.Value;

                // Only activations whose placement strategy uses the grain directory are candidates.
                if (grainContext.GetComponent<PlacementStrategy>() is not { IsUsingGrainDirectory: true })
                {
                    continue;
                }

                // Grain types that are not resolved to the DHT grain directory are ignored.
                var grainId = grainContext.GrainId;
                if (!grainDirectoryResolver.IsUsingDhtGrainDirectory(grainId.Type))
                {
                    continue;
                }

                if (removedSilo.Equals(GetPrimaryForGrain(grainId)))
                {
                    affected.Add(grainContext);
                }
            }
            catch (Exception exc)
            {
                // A failure on one activation is logged and must not stop the scan of the others.
                LogErrorCollectActivationsToDeactivate(new(removedSilo), exc);
            }
        }

        if (affected.Count != 0)
        {
            LogInfoDeactivatingActivationsDueToSiloFailure(affected.Count, new(removedSilo));
        }
    }
    catch (Exception exc)
    {
        // Covers failures of the enumeration itself; whatever was collected so far is still returned.
        LogErrorCollectActivationsToDeactivate(new(removedSilo), exc);
    }

    return affected;
}
230+
/// <summary>
/// Calls <c>Deactivate</c> on every grain context in <paramref name="activations"/>,
/// passing the same <paramref name="reason"/> to each one.
/// </summary>
/// <param name="reason">The deactivation reason handed to each activation.</param>
/// <param name="activations">The grain contexts to deactivate.</param>
private void DeactivateActivations(DeactivationReason reason, List<IGrainContext> activations)
{
    var total = activations.Count;
    LogDebugDeactivateActivations(total);

    // Each call is made with CancellationToken.None, i.e. no cancellation is requested.
    for (var index = 0; index < total; index++)
    {
        activations[index].Deactivate(reason, CancellationToken.None);
    }
}
184240
185241 /// <summary>
@@ -826,9 +882,22 @@ private readonly struct SiloHashLogValue(SiloAddress? silo)
/// <summary>
/// Source-generated logger method: writes an Error-level entry, with the removed silo and the
/// caught exception, when collecting activations to deactivate fails.
/// </summary>
// NOTE(review): this span is a diff rendering; the line marked '-' is the previous message text.
826882 [ LoggerMessage (
827883 EventId = ( int ) ErrorCode . Directory_SiloStatusChangeNotification_Exception ,
828884 Level = LogLevel . Error ,
829- Message = "CatalogSiloStatusListener.SiloStatusChangeNotification has thrown an exception when notified about removed silo {Silo}."
885+ Message = "Exception while collecting activations to deactivate after removal of silo {Silo}."
886+ ) ]
887+ private partial void LogErrorCollectActivationsToDeactivate ( SiloAddressLogValue silo , Exception exception ) ;
888+
/// <summary>
/// Source-generated logger method: writes an Information-level entry stating how many activations
/// are being deactivated because the given silo, their primary directory partition, failed.
/// </summary>
889+ [ LoggerMessage (
890+ EventId = ( int ) ErrorCode . Catalog_SiloStatusChangeNotification ,
891+ Level = LogLevel . Information ,
892+ Message = "Deactivating {Count} activations due to failure of silo {Silo}, since it was the primary directory partition for these grain ids."
893+ ) ]
894+ private partial void LogInfoDeactivatingActivationsDueToSiloFailure ( int count , SiloAddressLogValue silo ) ;
895+
/// <summary>
/// Source-generated logger method: writes a Debug-level entry with the number of activations
/// passed to DeactivateActivations.
/// </summary>
// NOTE(review): this span is a diff rendering; the line marked '-' is the declaration that was removed.
896+ [ LoggerMessage (
897+ Level = LogLevel . Debug ,
898+ Message = "DeactivateActivations: {Count} activations."
830899 ) ]
831- private partial void LogErrorCatalogSiloStatusChangeNotificationException ( Exception exception , SiloAddressLogValue silo ) ;
900+ private partial void LogDebugDeactivateActivations ( int count ) ;
832901
833902 [ LoggerMessage (
834903 Level = LogLevel . Debug ,
0 commit comments