Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions src/contrib/cluster/Akka.Cluster.Sharding/ShardCoordinator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1790,6 +1790,31 @@ internal bool Active(object message)
}
return true;

case ShardStopped m:
// Ported from Pekko: clean up unAckedHostShards when shard reports stopped
if (_unAckedHostShards.TryGetValue(m.Shard, out var stopCancel))
{
stopCancel.Cancel();
_unAckedHostShards = _unAckedHostShards.Remove(m.Shard);
}

// Safety net: if no rebalance is in progress for this shard (RebalanceWorker
// already timed out), deallocate the shard so it can be reallocated elsewhere.
// This prevents the shard from being endlessly recreated via GetShardHome/ShardHome.
if (!_rebalanceInProgress.ContainsKey(m.Shard) && State.Shards.ContainsKey(m.Shard))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have seen a ton of GetShardHome / ShardHome spam in my apps and I assumed it was Phobos' sharding metric polling responsible for that. Apparently this bug is also a big contributor.

{
Log.Info("{0}: Shard [{1}] stopped - performing late deallocation (rebalance worker timed out).",
TypeName, m.Shard);
Update(new ShardHomeDeallocated(m.Shard), evt =>
{
State = State.Updated(evt);
Log.Debug("{0}: Shard [{1}] deallocated (late)", TypeName, m.Shard);
AllocateShardHomesForRememberEntities();
_context.Self.Tell(new GetShardHome(m.Shard), _ignoreRef);
});
}
return true;

case ResendShardHost m:
{
if (State.Shards.TryGetValue(m.Shard, out var region) && region.Equals(region))
Expand Down
6 changes: 6 additions & 0 deletions src/contrib/cluster/Akka.Cluster.Sharding/ShardRegion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1410,6 +1410,12 @@ private void HandleTerminated(Terminated terminated)
{
_handingOff = _handingOff.Remove(terminated.ActorRef);
_log.Debug("{0}: Shard [{1}] handoff complete", _typeName, shard);

// Send backup ShardStopped to coordinator in case the RebalanceWorker
// has already timed out and missed the ShardStopped from HandOffStopper.
// The coordinator's Active handler will only deallocate if no rebalance
// is currently in progress for this shard.
_coordinator?.Tell(new ShardCoordinator.ShardStopped(shard));
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

allows us to double-tap the ShardStopped message handling in case the RebalanceWorker has died already

}
else
{
Expand Down
Loading