Skip to content

Commit 8a11af1

Browse files
committed
adapt the refresh mechanism to consider the replication_factor
1 parent 5e209b1 commit 8a11af1

File tree

2 files changed

+36
-13
lines changed

2 files changed

+36
-13
lines changed

edgeless_orc/src/active_instance.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ impl ActiveInstance {
3232
pub fn instance_ids_mut(&mut self) -> &mut Vec<(edgeless_api::function_instance::InstanceId, bool)> {
3333
match self {
3434
Self::Function(_, ids) => ids,
35-
Self::Resource(_, id) => {
35+
Self::Resource(_, _) => {
3636
panic!("Cannot get mutable reference to instance ids of a resource instance - there is always only one instance");
3737
}
3838
}

edgeless_orc/src/orchestrator_task.rs

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,7 @@ impl OrchestratorTask {
586586
// Finally try to spawn the function instance on the
587587
// selected client.
588588
// [TODO] Issue#96 We assume that one "active" instance is spawned per node.
589+
// When replication_factor is specified, we start more instances in standby mode.
589590
// Other instances, spawned on other nodes are considered "hot-standby" ones.
590591
match fn_client.start(spawn_req.clone()).await {
591592
Ok(res) => match res {
@@ -596,25 +597,19 @@ impl OrchestratorTask {
596597
assert!(*node_id == id.node_id);
597598
// if the lid is already present, append the new instance id to the list
598599
if let Some(existing_instance) = self.active_instances.get_mut(lid) {
599-
existing_instance.instance_ids_mut().append(&mut vec![(
600-
edgeless_api::function_instance::InstanceId {
600+
existing_instance.instance_ids_mut().append(&mut vec![(edgeless_api::function_instance::InstanceId {
601601
node_id: *node_id,
602602
function_id: id.function_id,
603-
},
604-
false,
605-
)]); // not a hot-standby instance
603+
}, false)]); // not a hot-standby instance
606604
} else {
607605
self.active_instances.insert(
608606
*lid,
609607
crate::active_instance::ActiveInstance::Function(
610608
spawn_req.clone(),
611-
vec![(
612-
edgeless_api::function_instance::InstanceId {
609+
vec![(edgeless_api::function_instance::InstanceId {
613610
node_id: *node_id,
614611
function_id: id.function_id,
615-
},
616-
true,
617-
)], // only the first started instance is "used" - the rest are hot standby
612+
}, true)], // only the first started instance is "used" - the rest are hot standby
618613
),
619614
);
620615
}
@@ -948,7 +943,16 @@ impl OrchestratorTask {
948943
//
949944
// Make sure that all active logical functions are assigned
950945
// to at least one instance: for all the function instances that
951-
// were running on disconnected nodes, create new instances.
946+
// were running on disconnected nodes, create new instances. If
947+
// replication is enabled, make sure to fail over gracefully.
948+
//
949+
// Default behavior (no replication_factor): every LID has exactly
950+
// one physical function instance and failover can only be done
951+
// after a new instance of the function has been started in the cluster.
952+
//
953+
// KPI-13: If the replication_factor for a function is > 1, then we do a
954+
// graceful failover to a hot-standby function and then make sure
955+
// that enough copies are still available in the cluster.
952956
//
953957

954958
// List of LIDs that will have to be repatched
@@ -968,7 +972,8 @@ impl OrchestratorTask {
968972
// value: resource specs
969973
let mut res_to_be_created = std::collections::HashMap::new();
970974

971-
// List of lid that will have to be repatched.
975+
// List of lid that will have to be repatched. These are active
976+
// instances where at least one more instance is running.
972977
let mut active_instances_to_be_updated = vec![];
973978

974979
// Find all the functions/resources affected.
@@ -982,6 +987,23 @@ impl OrchestratorTask {
982987
crate::active_instance::ActiveInstance::Function(start_req, instances) => {
983988
let num_disconnected = instances.iter().filter(|x| !self.nodes.contains_key(&x.0.node_id)).count();
984989
assert!(num_disconnected <= instances.len());
990+
if let Some(replicas) = start_req.replication_factor {
991+
// KPI-13 hot redundancy mechanism
992+
if num_disconnected == replicas as usize {
993+
log::warn!("All function replicas have died - graceful failover not possible, we need to start at least one instance first");
994+
} else {
995+
// We need to check if the "active" replica (the one serving traffic) or one of the "hot-standby" replicas have been disconnected
996+
// If the "active" replica died, we need to repatch as fast as possible due to KPI-13
997+
// If one of the "hot-standby" replicas died, we simply start it on another node. In case there is no other node that can host it, we show an error message.
998+
let active_replica_died = instances.iter().filter(|x| !self.nodes.contains_key(&x.0.node_id) && x.1).count() == 1;
999+
if active_replica_died {
1000+
log::info!("Graceful failover possible! (at least one hot-standby replica is available)");
1001+
} else {
1002+
log::info!("Active replica still works! Restarting a replica");
1003+
}
1004+
}
1005+
} else {
1006+
// Default mechanism
9851007
if instances.is_empty() || num_disconnected > 0 {
9861008
to_be_repatched.push(*origin_lid);
9871009
if instances.is_empty() || num_disconnected == instances.len() {
@@ -996,6 +1018,7 @@ impl OrchestratorTask {
9961018
// let the others still alive handle
9971019
// the logical function.
9981020
active_instances_to_be_updated.push(*origin_lid);
1021+
}
9991022
}
10001023
}
10011024
}

0 commit comments

Comments
 (0)