Skip to content

Commit 0f4a90c

Browse files
committed
vmm: Auto restart exited VMs
1 parent 8d120e1 commit 0f4a90c

File tree

4 files changed

+60
-2
lines changed

4 files changed

+60
-2
lines changed

vmm/src/app.rs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ use guest_api::client::DefaultClient as GuestClient;
1212
use id_pool::IdPool;
1313
use ra_rpc::client::RaClient;
1414
use serde::{Deserialize, Serialize};
15-
use std::collections::{HashMap, HashSet};
15+
use std::collections::{BTreeSet, HashMap, HashSet};
1616
use std::net::IpAddr;
1717
use std::path::{Path, PathBuf};
1818
use std::sync::{Arc, Mutex, MutexGuard};
@@ -658,6 +658,33 @@ impl App {
658658
})
659659
.collect())
660660
}
661+
662+
pub(crate) async fn try_restart_exited_vms(&self) -> Result<()> {
663+
let running_vms = self
664+
.supervisor
665+
.list()
666+
.await
667+
.context("Failed to list VMs")?
668+
.iter()
669+
.filter(|v| v.state.status.is_running())
670+
.map(|v| v.config.id.clone())
671+
.collect::<BTreeSet<_>>();
672+
let exited_vms = self
673+
.lock()
674+
.iter_vms()
675+
.filter(|vm| {
676+
let workdir = self.work_dir(&vm.config.manifest.id);
677+
let started = workdir.started().unwrap_or(false);
678+
started && !running_vms.contains(&vm.config.manifest.id)
679+
})
680+
.map(|vm| vm.config.manifest.id.clone())
681+
.collect::<Vec<_>>();
682+
for id in exited_vms {
683+
info!("Restarting VM {id}");
684+
self.start_vm(&id).await?;
685+
}
686+
Ok(())
687+
}
661688
}
662689

663690
fn paginate<T>(items: Vec<T>, page: u32, page_size: u32) -> impl Iterator<Item = T> {

vmm/src/config.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,12 @@ pub struct PortMappingConfig {
6262
pub range: Vec<PortRange>,
6363
}
6464

65+
#[derive(Debug, Clone, Deserialize)]
66+
pub struct AutoRestartConfig {
67+
pub enabled: bool,
68+
pub interval: u64,
69+
}
70+
6571
impl PortMappingConfig {
6672
pub fn is_allowed(&self, protocol: &str, port: u16) -> bool {
6773
if !self.enabled {
@@ -111,6 +117,9 @@ pub struct CvmConfig {
111117
/// The tmp CA key
112118
#[serde(default)]
113119
pub tmp_ca_key: String,
120+
121+
/// Auto restart configuration
122+
pub auto_restart: AutoRestartConfig,
114123
}
115124

116125
#[derive(Debug, Clone, Deserialize)]

vmm/src/main.rs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use std::path::Path;
1+
use std::{path::Path, time::Duration};
22

33
use anyhow::{anyhow, Context, Result};
44
use app::App;
@@ -16,6 +16,7 @@ use rocket::{
1616
use rocket_apitoken::ApiToken;
1717
use rocket_vsock_listener::VsockListener;
1818
use supervisor_client::SupervisorClient;
19+
use tracing::{error, info};
1920

2021
mod app;
2122
mod config;
@@ -102,6 +103,22 @@ async fn run_host_api(app: App, figment: Figment) -> Result<()> {
102103
Ok(())
103104
}
104105

106+
async fn auto_restart_task(app: App) {
107+
if !app.config.cvm.auto_restart.enabled {
108+
info!("Auto restart CVMs is disabled");
109+
return;
110+
}
111+
let mut interval =
112+
tokio::time::interval(Duration::from_secs(app.config.cvm.auto_restart.interval));
113+
loop {
114+
info!("Checking for exited VMs");
115+
if let Err(err) = app.try_restart_exited_vms().await {
116+
error!("Failed to restart exited VMs: {err:?}");
117+
}
118+
interval.tick().await;
119+
}
120+
}
121+
105122
#[rocket::main]
106123
async fn main() -> Result<()> {
107124
{
@@ -130,6 +147,7 @@ async fn main() -> Result<()> {
130147
};
131148
let state = app::App::new(config, supervisor);
132149
state.reload_vms().await.context("Failed to reload VMs")?;
150+
tokio::spawn(auto_restart_task(state.clone()));
133151

134152
tokio::select! {
135153
result = run_external_api(state.clone(), figment.clone(), api_auth) => {

vmm/vmm.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ range = [
3838
{ protocol = "tcp", from = 1, to = 20000 },
3939
]
4040

41+
[cvm.auto_restart]
42+
enabled = true
43+
interval = 20
44+
4145
[cvm.gpu]
4246
enabled = false
4347
# The product IDs of the GPUs to discover

0 commit comments

Comments
 (0)