@@ -55,27 +55,15 @@ func clusterOptions(ctx context.Context) []goakt.Option {
5555 logger .Fatal (ctx ).Err (err ).Msg ("Failed to create cluster discovery provider." )
5656 }
5757
58- var remotingHost string
59- if app .IsDevEnvironment () {
60- // only bind to localhost in development
61- remotingHost = "127.0.0.1"
62- } else {
63- // otherwise bind to all interfaces
64- remotingHost = "0.0.0.0"
65- }
66-
67- readTimeout := getDurationFromEnv ("MODUS_CLUSTER_READ_TIMEOUT_SECONDS" , 2 , time .Second )
68- writeTimeout := getDurationFromEnv ("MODUS_CLUSTER_WRITE_TIMEOUT_SECONDS" , 2 , time .Second )
69-
7058 return []goakt.Option {
71- goakt .WithRemote (remote .NewConfig (remotingHost , remotingPort )),
59+ goakt .WithRemote (remote .NewConfig (remotingHost () , remotingPort )),
7260 goakt .WithCluster (goakt .NewClusterConfig ().
7361 WithDiscovery (disco ).
7462 WithDiscoveryPort (discoveryPort ).
7563 WithPeersPort (peersPort ).
76- WithReadTimeout (readTimeout ).
77- WithWriteTimeout (writeTimeout ).
78- // WithPartitionCount(3 ).
64+ WithReadTimeout (readTimeout () ).
65+ WithWriteTimeout (writeTimeout () ).
66+ WithPartitionCount (partitionCount () ).
7967 WithClusterStateSyncInterval (nodesSyncInterval ()).
8068 WithPeersStateSyncInterval (peerSyncInterval ()).
8169 WithKinds (& wasmAgentActor {}, & subscriptionActor {}),
@@ -162,6 +150,18 @@ func clusterHost() string {
162150 }
163151}
164152
153+ // remotingHost returns the host address to bind the remoting system to: "127.0.0.1" when app.IsDevEnvironment() reports true, and "0.0.0.0" otherwise.
154+ func remotingHost () string {
155+ // In development, bind to localhost only.
156+ if app .IsDevEnvironment () {
157+ return "127.0.0.1"
158+ }
159+
160+ // In every other environment, bind to all interfaces.
161+ return "0.0.0.0"
162+ }
163+
164+ // clusterPorts returns the ports used for discovery, remoting, and peer communication in the cluster.
165165func clusterPorts () (discoveryPort , remotingPort , peersPort int ) {
166166
167167 // Get default ports dynamically, but use environment variables if set
@@ -173,19 +173,47 @@ func clusterPorts() (discoveryPort, remotingPort, peersPort int) {
173173 return
174174}
175175
176- // peerSyncInterval returns the interval at which the cluster peers sync their list of actors across the cluster.
176+ // peerSyncInterval returns the interval at which the actor system syncs its list of actors to the other nodes in the cluster.
177177// We use a tight sync interval of 1 second by default (looked up under MODUS_CLUSTER_PEER_SYNC_SECONDS), to ensure quick peer discovery as agents are added or removed.
178+ //
179+ // This value is also used as a sleep duration, both on system startup and when spawning a new agent actor,
180+ // so it needs to be low enough that the user does not notice the delay.
178181func peerSyncInterval () time.Duration {
179182 return getDurationFromEnv ("MODUS_CLUSTER_PEER_SYNC_SECONDS" , 1 , time .Second )
180183}
181184
182- // nodesSyncInterval returns the interval at which the cluster syncs the list of active nodes across the cluster.
183- // On each interval, discovery will be triggered to find new nodes and update the cluster state.
185+ // nodesSyncInterval returns the interval at which the cluster forces a resync of the list of active nodes across the cluster.
186+ // This matters only when a node goes down unexpectedly: the other nodes in the cluster will not be aware of the change until the next sync.
187+ // It has no effect when a node is shut down gracefully, because that is communicated immediately during the shutdown process.
188+ //
189+ // On each interval, the node syncs its list of nodes with the cluster and updates its local state accordingly.
184190// The default is 10 seconds (looked up under MODUS_CLUSTER_NODES_SYNC_SECONDS), which is a reasonable balance between responsiveness and network overhead.
185191func nodesSyncInterval () time.Duration {
186192 return getDurationFromEnv ("MODUS_CLUSTER_NODES_SYNC_SECONDS" , 10 , time .Second )
187193}
188194
195+ // partitionCount returns the number of partitions the cluster will use for actor distribution (looked up under MODUS_CLUSTER_PARTITION_COUNT).
196+ // It must be a prime number to work properly with the actor system's hashing algorithm.
197+ // It must also be greater than the number of nodes in the cluster, yet small enough to avoid excessive overhead.
198+ // In testing, 23 is the highest value that works well with the other default timing constraints.
199+ // We use a slightly lower default of 13, which is still a prime number and should work well for most clusters.
200+ // The GoAkt default is 271, but this has been found to lead to other errors in practice. NOTE(review): the looked-up value is converted to uint64 here with no primality or range check, so if getIntFromEnv can return a negative int it would wrap to a very large count — confirm.
201+ func partitionCount () uint64 {
202+ return uint64 (getIntFromEnv ("MODUS_CLUSTER_PARTITION_COUNT" , 13 ))
203+ }
204+
205+ // readTimeout returns the duration to wait for a cluster read operation before timing out.
206+ // The value is looked up under MODUS_CLUSTER_READ_TIMEOUT_SECONDS; the default of 1 second should usually not need to be changed.
207+ func readTimeout () time.Duration {
208+ return getDurationFromEnv ("MODUS_CLUSTER_READ_TIMEOUT_SECONDS" , 1 , time .Second )
209+ }
210+
211+ // writeTimeout returns the duration to wait for a cluster write operation before timing out.
212+ // The value is looked up under MODUS_CLUSTER_WRITE_TIMEOUT_SECONDS; the default of 1 second should usually not need to be changed.
213+ func writeTimeout () time.Duration {
214+ return getDurationFromEnv ("MODUS_CLUSTER_WRITE_TIMEOUT_SECONDS" , 1 , time .Second )
215+ }
216+
189217func getPodLabels () map [string ]string {
190218 // example value: "app.kubernetes.io/name=modus,app.kubernetes.io/component=runtime"
191219 if labels := os .Getenv ("MODUS_CLUSTER_POD_LABELS" ); labels != "" {
0 commit comments