@@ -11,6 +11,7 @@ package actors
1111
1212import (
1313 "context"
14+ "fmt"
1415 "time"
1516
1617 "github.com/hypermodeinc/modus/runtime/db"
@@ -33,6 +34,7 @@ func Initialize(ctx context.Context) {
3334
3435 opts := []goakt.Option {
3536 goakt .WithLogger (newActorLogger (logger .Get (ctx ))),
37+ goakt .WithCoordinatedShutdown (beforeShutdown ),
3638 goakt .WithPubSub (),
3739 goakt .WithActorInitTimeout (10 * time .Second ), // TODO: adjust this value, or make it configurable
3840 goakt .WithActorInitMaxRetries (1 ), // TODO: adjust this value, or make it configurable
@@ -50,6 +52,8 @@ func Initialize(ctx context.Context) {
5052 _actorSystem = actorSystem
5153 }
5254
55+ waitForClusterSync ()
56+
5357 logger .Info (ctx ).Msg ("Actor system started." )
5458
5559 pluginmanager .RegisterPluginLoadedCallback (loadAgentActors )
@@ -70,54 +74,76 @@ func loadAgentActors(ctx context.Context, plugin *plugins.Plugin) error {
7074
7175 // spawn actors for agents with state in the database, that are not already running
7276 // check both locally and on remote nodes in the cluster
77+ logger .Debug (ctx ).Msg ("Restoring agent actors from database." )
7378 agents , err := db .QueryActiveAgents (ctx )
7479 if err != nil {
75- logger .Err (ctx , err ).Msg ("Failed to query agents from database." )
76- return err
80+ return fmt .Errorf ("failed to query active agents: %w" , err )
7781 }
82+ inCluster := _actorSystem .InCluster ()
7883 for _ , agent := range agents {
7984 if ! localAgents [agent .Id ] {
80- if _actorSystem . InCluster () {
85+ if inCluster {
8186 actorName := getActorName (agent .Id )
82- if _ , err := _actorSystem .RemoteActor (ctx , actorName ); err == nil {
83- // found actor in cluster, no need to spawn it again
87+ if exists , err := _actorSystem .ActorExists (ctx , actorName ); err != nil {
88+ logger .Err (ctx , err ).Msgf ("Failed to check if actor %s exists in cluster." , actorName )
89+ } else if exists {
90+ // if the actor already exists in the cluster, skip spawning it
8491 continue
8592 }
8693 }
87- go func (f_ctx context.Context , pluginName , agentId , agentName string ) {
88- if err := spawnActorForAgent (f_ctx , pluginName , agentId , agentName , false ); err != nil {
89- logger .Err (f_ctx , err ).Msgf ("Failed to spawn actor for agent %s." , agentId )
90- }
91- }(ctx , plugin .Name (), agent .Id , agent .Name )
94+ if err := spawnActorForAgent (ctx , plugin .Name (), agent .Id , agent .Name , false ); err != nil {
95+ logger .Err (ctx , err ).Msgf ("Failed to spawn actor for agent %s." , agent .Id )
96+ }
9297 }
9398 }
9499
95100 return nil
96101}
97102
98- func beforeShutdown (ctx context.Context ) {
103+ func beforeShutdown (ctx context.Context ) error {
104+ _actorSystem .Logger ().(* actorLogger ).shuttingDown = true
99105 logger .Info (ctx ).Msg ("Actor system shutting down..." )
106+ actors := _actorSystem .Actors ()
100107
101- // stop all agent actors before shutdown so they can suspend properly
102- for _ , pid := range _actorSystem .Actors () {
103- if _ , ok := pid .Actor ().(* wasmAgentActor ); ok {
108+ // Suspend all local running agent actors first, which allows them to gracefully stop and persist their state.
109+ // In cluster mode, this will also allow the actor to resume on another node after this node shuts down.
110+ for _ , pid := range actors {
111+ if actor , ok := pid .Actor ().(* wasmAgentActor ); ok && pid .IsRunning () {
112+ if actor .status == AgentStatusRunning {
113+ ctx := actor .augmentContext (ctx , pid )
114+ if err := actor .suspendAgent (ctx ); err != nil {
115+ logger .Err (ctx , err ).Str ("agent_id" , actor .agentId ).Msg ("Failed to suspend agent actor." )
116+ }
117+ }
118+ }
119+ }
104120
105- // pass the pid so it can be used during shutdown as an event sender
106- ctx := context .WithValue (ctx , pidContextKey {}, pid )
121+ // Then shut down subscription actors. They will have received the suspend message already.
122+ for _ , pid := range actors {
123+ if _ , ok := pid .Actor ().(* subscriptionActor ); ok && pid .IsRunning () {
107124 if err := pid .Shutdown (ctx ); err != nil {
108125 logger .Err (ctx , err ).Msgf ("Failed to shutdown actor %s." , pid .Name ())
109126 }
110127 }
111128 }
129+
130+ waitForClusterSync ()
131+
132+ // then allow the actor system to continue with its shutdown process
133+ return nil
134+ }
135+
136+ func waitForClusterSync () {
137+ if clusterEnabled () {
138+ time .Sleep (peerSyncInterval () * 2 )
139+ }
112140}
113141
114142func Shutdown (ctx context.Context ) {
115143 if _actorSystem == nil {
116- return
144+ logger . Fatal ( ctx ). Msg ( "Actor system is not initialized, cannot shutdown." )
117145 }
118146
119- beforeShutdown (ctx )
120-
121147 if err := _actorSystem .Stop (ctx ); err != nil {
122148 logger .Err (ctx , err ).Msg ("Failed to shutdown actor system." )
123149 }
0 commit comments