@@ -19,6 +19,7 @@ import (
1919 "github.com/hypermodeinc/modus/runtime/messages"
2020 "github.com/hypermodeinc/modus/runtime/pluginmanager"
2121 "github.com/hypermodeinc/modus/runtime/plugins"
22+ "github.com/hypermodeinc/modus/runtime/utils"
2223 "github.com/hypermodeinc/modus/runtime/wasmhost"
2324
2425 goakt "github.com/tochemey/goakt/v3/actor"
@@ -27,6 +28,8 @@ import (
2728var _actorSystem goakt.ActorSystem
2829
2930func Initialize (ctx context.Context ) {
31+ span , ctx := utils .NewSentrySpanForCurrentFunc (ctx )
32+ defer span .Finish ()
3033
3134 wasmExt := & wasmExtension {
3235 host : wasmhost .GetWasmHost (ctx ),
@@ -42,24 +45,47 @@ func Initialize(ctx context.Context) {
4245 }
4346 opts = append (opts , clusterOptions (ctx )... )
4447
45- if actorSystem , err := goakt .NewActorSystem ("modus" , opts ... ); err != nil {
48+ actorSystem , err := goakt .NewActorSystem ("modus" , opts ... )
49+ if err != nil {
4650 logger .Fatal (ctx ).Err (err ).Msg ("Failed to create actor system." )
47- } else if err := actorSystem .Start (ctx ); err != nil {
51+ }
52+
53+ if err := startActorSystem (ctx , actorSystem ); err != nil {
4854 logger .Fatal (ctx ).Err (err ).Msg ("Failed to start actor system." )
49- } else if err := actorSystem .Inject (& wasmAgentInfo {}); err != nil {
55+ }
56+
57+ if err := actorSystem .Inject (& wasmAgentInfo {}); err != nil {
5058 logger .Fatal (ctx ).Err (err ).Msg ("Failed to inject wasm agent info into actor system." )
51- } else {
52- _actorSystem = actorSystem
5359 }
5460
55- waitForClusterSync ()
61+ _actorSystem = actorSystem
5662
5763 logger .Info (ctx ).Msg ("Actor system started." )
5864
5965 pluginmanager .RegisterPluginLoadedCallback (loadAgentActors )
6066}
6167
68+ func startActorSystem (ctx context.Context , actorSystem goakt.ActorSystem ) error {
69+ maxRetries := getIntFromEnv ("MODUS_ACTOR_SYSTEM_START_MAX_RETRIES" , 5 )
70+ retryInterval := getDurationFromEnv ("MODUS_ACTOR_SYSTEM_START_RETRY_INTERVAL_SECONDS" , 2 , time .Second )
71+
72+ for i := range maxRetries {
73+ if err := actorSystem .Start (ctx ); err != nil {
74+ logger .Warn (ctx ).Err (err ).Int ("attempt" , i + 1 ).Msgf ("Failed to start actor system, retrying in %s..." , retryInterval )
75+ time .Sleep (retryInterval )
76+ retryInterval *= 2 // Exponential backoff
77+ continue
78+ }
79+ return nil
80+ }
81+
82+ return fmt .Errorf ("failed to start actor system after %d retries" , maxRetries )
83+ }
84+
6285func loadAgentActors (ctx context.Context , plugin * plugins.Plugin ) error {
86+ span , ctx := utils .NewSentrySpanForCurrentFunc (ctx )
87+ defer span .Finish ()
88+
6389 // restart local actors that are already running, which will reload the plugin
6490 actors := _actorSystem .Actors ()
6591 localAgents := make (map [string ]bool , len (actors ))
@@ -72,36 +98,44 @@ func loadAgentActors(ctx context.Context, plugin *plugins.Plugin) error {
7298 }
7399 }
74100
75- // spawn actors for agents with state in the database, that are not already running
76- // check both locally and on remote nodes in the cluster
77- logger .Debug (ctx ).Msg ("Restoring agent actors from database." )
78- agents , err := db .QueryActiveAgents (ctx )
79- if err != nil {
80- return fmt .Errorf ("failed to query active agents: %w" , err )
81- }
82- inCluster := _actorSystem .InCluster ()
83- for _ , agent := range agents {
84- if ! localAgents [agent .Id ] {
85- if inCluster {
86- actorName := getActorName (agent .Id )
87- if exists , err := _actorSystem .ActorExists (ctx , actorName ); err != nil {
88- logger .Err (ctx , err ).Msgf ("Failed to check if actor %s exists in cluster." , actorName )
89- } else if exists {
90- // if the actor already exists in the cluster, skip spawning it
91- continue
101+ // do this next part in a goroutine to avoid blocking the cluster engine startup
102+ go func () {
103+ waitForClusterSync ()
104+
105+ // spawn actors for agents with state in the database, that are not already running
106+ // check both locally and on remote nodes in the cluster
107+ logger .Debug (ctx ).Msg ("Restoring agent actors from database." )
108+ agents , err := db .QueryActiveAgents (ctx )
109+ if err != nil {
110+ logger .Err (ctx , err ).Msg ("Failed to query active agents from database." )
111+ return
112+ }
113+ inCluster := _actorSystem .InCluster ()
114+ for _ , agent := range agents {
115+ if ! localAgents [agent .Id ] {
116+ if inCluster {
117+ actorName := getActorName (agent .Id )
118+ if exists , err := _actorSystem .ActorExists (ctx , actorName ); err != nil {
119+ logger .Err (ctx , err ).Msgf ("Failed to check if actor %s exists in cluster." , actorName )
120+ } else if exists {
121+ // if the actor already exists in the cluster, skip spawning it
122+ continue
123+ }
124+ }
125+ if err := spawnActorForAgent (ctx , plugin .Name (), agent .Id , agent .Name , false ); err != nil {
126+ logger .Err (ctx , err ).Msgf ("Failed to spawn actor for agent %s." , agent .Id )
92127 }
93- }
94- if err := spawnActorForAgent (ctx , plugin .Name (), agent .Id , agent .Name , false ); err != nil {
95- logger .Err (ctx , err ).Msgf ("Failed to spawn actor for agent %s." , agent .Id )
96128 }
97129 }
98- }
130+ }()
99131
100132 return nil
101133}
102134
103135func beforeShutdown (ctx context.Context ) error {
104- _actorSystem .Logger ().(* actorLogger ).shuttingDown = true
136+ span , ctx := utils .NewSentrySpanForCurrentFunc (ctx )
137+ defer span .Finish ()
138+
105139 logger .Info (ctx ).Msg ("Actor system shutting down..." )
106140 actors := _actorSystem .Actors ()
107141
@@ -127,19 +161,22 @@ func beforeShutdown(ctx context.Context) error {
127161 }
128162 }
129163
130- waitForClusterSync ()
164+ // waitForClusterSync()
131165
132166 // then allow the actor system to continue with its shutdown process
133167 return nil
134168}
135169
136170func waitForClusterSync () {
137171 if clusterEnabled () {
138- time .Sleep (peerSyncInterval () * 2 )
172+ time .Sleep (nodesSyncInterval () )
139173 }
140174}
141175
142176func Shutdown (ctx context.Context ) {
177+ span , ctx := utils .NewSentrySpanForCurrentFunc (ctx )
178+ defer span .Finish ()
179+
143180 if _actorSystem == nil {
144181 logger .Fatal (ctx ).Msg ("Actor system is not initialized, cannot shutdown." )
145182 }
0 commit comments