Skip to content

Commit 8a11af1

Browse files
committed
adapt the refresh mechanism to consider the replication_factor
1 parent 5e209b1 commit 8a11af1

File tree

2 files changed

+36
-13
lines changed

2 files changed

+36
-13
lines changed

edgeless_orc/src/active_instance.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ impl ActiveInstance {
3232
pub fn instance_ids_mut(&mut self) -> &mut Vec<(edgeless_api::function_instance::InstanceId, bool)> {
3333
match self {
3434
Self::Function(_, ids) => ids,
35-
Self::Resource(_, id) => {
35+
Self::Resource(_, _) => {
3636
panic!("Cannot get mutable reference to instance ids of a resource instance - there is always only one instance");
3737
}
3838
}

edgeless_orc/src/orchestrator_task.rs

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,7 @@ impl OrchestratorTask {
586586
// Finally try to spawn the function instance on the
587587
// selected client.
588588
// [TODO] Issue#96 We assume that one "active" instance is spawned per node.
589+
// When replication_factor is specified, we start more instances in standby mode.
589590
// Other instances, spawned on other nodes are considered "hot-standby" ones.
590591
match fn_client.start(spawn_req.clone()).await {
591592
Ok(res) => match res {
@@ -596,25 +597,19 @@ impl OrchestratorTask {
596597
assert!(*node_id == id.node_id);
597598
// if the lid is already present, append the new instance id to the list
598599
if let Some(existing_instance) = self.active_instances.get_mut(lid) {
599-
existing_instance.instance_ids_mut().append(&mut vec![(
600-
edgeless_api::function_instance::InstanceId {
600+
existing_instance.instance_ids_mut().append(&mut vec![(edgeless_api::function_instance::InstanceId {
601601
node_id: *node_id,
602602
function_id: id.function_id,
603-
},
604-
false,
605-
)]); // not a hot-standby instance
603+
}, false)]); // not a hot-standby instance
606604
} else {
607605
self.active_instances.insert(
608606
*lid,
609607
crate::active_instance::ActiveInstance::Function(
610608
spawn_req.clone(),
611-
vec![(
612-
edgeless_api::function_instance::InstanceId {
609+
vec![(edgeless_api::function_instance::InstanceId {
613610
node_id: *node_id,
614611
function_id: id.function_id,
615-
},
616-
true,
617-
)], // only the first started instance is "used" - the rest are hot standby
612+
}, true)], // only the first started instance is "used" - the rest are hot standby
618613
),
619614
);
620615
}
@@ -948,7 +943,16 @@ impl OrchestratorTask {
948943
//
949944
// Make sure that all active logical functions are assigned
950945
// to at least one instance: for all the function instances that
951-
// were running on disconnected nodes, create new instances.
946+
// were running on disconnected nodes, create new instances. If
947+
// replication is enabled, make sure to fail over gracefully.
948+
//
949+
// Default behavior (no replication_factor): every LID has exactly
950+
// one physical function instance and failover can only be done
951+
// after a new instance of the function has been started in the cluster.
952+
//
953+
// KPI-13: If the replication_factor for a function is > 1, then we do a
954+
// graceful failover to a hot-standby function and then make sure
955+
// that enough copies are still available in the cluster.
952956
//
953957

954958
// List of LIDs that will have to be repatched
@@ -968,7 +972,8 @@ impl OrchestratorTask {
968972
// value: resource specs
969973
let mut res_to_be_created = std::collections::HashMap::new();
970974

971-
// List of lid that will have to be repatched.
975+
// List of lid that will have to be repatched. These are active
976+
// instances where at least one more instance is running.
972977
let mut active_instances_to_be_updated = vec![];
973978

974979
// Find all the functions/resources affected.
@@ -982,6 +987,23 @@ impl OrchestratorTask {
982987
crate::active_instance::ActiveInstance::Function(start_req, instances) => {
983988
let num_disconnected = instances.iter().filter(|x| !self.nodes.contains_key(&x.0.node_id)).count();
984989
assert!(num_disconnected <= instances.len());
990+
if let Some(replicas) = start_req.replication_factor {
991+
// KPI-13 hot redundancy mechanism
992+
if num_disconnected == replicas as usize {
993+
log::warn!("All function replicas have died - graceful failover not possible, we need to start at least one instance first");
994+
} else {
995+
// We need to check if the "active" replica (the one serving traffic) or one of the "hot-standby" replicas have been disconnected
996+
// If the "active" replica died, we need to repatch as fast as possible due to KPI-13
997+
// If one of the "hot-standby" replicas died, we simply start it on another node. In case there is no other node that can host it, we show an error message.
998+
let active_replica_died = instances.iter().filter(|x| !self.nodes.contains_key(&x.0.node_id) && x.1).count() == 1;
999+
if active_replica_died {
1000+
log::info!("Graceful failover possible! (at least one hot-standby replica is available)");
1001+
} else {
1002+
log::info!("Active replica still works! Restarting a replica");
1003+
}
1004+
}
1005+
} else {
1006+
// Default mechanism
9851007
if instances.is_empty() || num_disconnected > 0 {
9861008
to_be_repatched.push(*origin_lid);
9871009
if instances.is_empty() || num_disconnected == instances.len() {
@@ -996,6 +1018,7 @@ impl OrchestratorTask {
9961018
// let the others still alive handle
9971019
// the logical function.
9981020
active_instances_to_be_updated.push(*origin_lid);
1021+
}
9991022
}
10001023
}
10011024
}

0 commit comments

Comments
 (0)