Skip to content

Commit 1631fb4

Browse files
committed
some work
1 parent 37ff5bc commit 1631fb4

File tree

1 file changed

+100
-24
lines changed
  • docker/docker_jit_monitor/src

1 file changed

+100
-24
lines changed

docker/docker_jit_monitor/src/main.rs

Lines changed: 100 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
use serde::Deserialize;
12
use serde_json::Value;
23
use std::{
3-
process::{self, Command},
4+
process::{self, Child, Command, Output},
45
string::FromUtf8Error,
56
sync::atomic::{AtomicU32, AtomicU64, Ordering},
67
thread,
@@ -13,6 +14,7 @@ use clap::Parser;
1314
use log::{debug, error, info, warn};
1415
static RUNNER_ID: AtomicU64 = AtomicU64::new(0);
1516
static EXITING: AtomicU32 = AtomicU32::new(0);
17+
const MAX_SPAWN_RETRIES: u32 = 10;
1618

1719
#[derive(Parser, Debug)]
1820
#[clap(version)]
@@ -115,36 +117,44 @@ enum SpawnRunnerError {
115117
NoHdcDeviceFound,
116118
#[error("Failed to list USB devices")]
117119
LsUsbError,
120+
#[error("Failed to deserialize list runners api")]
121+
LIstRunnersDeserialize,
118122
}
119123

120-
// todo: add arg for optional device to pass into the runner
121-
fn spawn_runner(config: &RunnerConfig) -> Result<process::Child, SpawnRunnerError> {
122-
// Note: octocrab apparently requires more coarse grained tokens compared to `gh`, so we use `gh`.
124+
/// Calls the GitHub Actions runners API through `gh`. `raw` is only set by `spawn_runner`.
125+
/// Returns the output of the _executed_ command.
126+
/// Note: octocrab apparently requires more coarse grained tokens compared
127+
/// to `gh`, so we use `gh`.
128+
fn call_runner_api(
129+
ci_scope: &str,
130+
method: &str,
131+
api_endpoint: &str,
132+
raw: Option<&RunnerConfig>,
133+
) -> Result<Output, SpawnRunnerError> {
123134
let mut cmd = Command::new("gh");
124-
let api_endpoint = format!(
125-
"{}/actions/runners/generate-jitconfig",
126-
config.servo_ci_scope
127-
);
135+
let api_endpoint = format!("{ci_scope}/actions/runners/{api_endpoint}");
128136
cmd.args([
129137
"api",
130138
"--method",
131-
"POST",
139+
method,
132140
"-H",
133141
"Accept: application/vnd.github+json",
134142
"-H",
135143
"X-GitHub-Api-Version: 2022-11-28",
136144
&api_endpoint,
137145
]);
138-
for label in &config.labels {
139-
cmd.arg("--raw-field").arg(format!("labels[]={label}"));
146+
if let Some(config) = raw {
147+
for label in &config.labels {
148+
cmd.arg("--raw-field").arg(format!("labels[]={label}"));
149+
}
150+
cmd.arg("--raw-field")
151+
// Todo: perhaps have a count here? Or add information if it has a device or not
152+
.arg(format!("name={}", config.name))
153+
.arg("--raw-field")
154+
.arg(format!("work_folder={}", config.work_folder))
155+
.arg("--field")
156+
.arg(format!("runner_group_id={}", config.runner_group_id));
140157
}
141-
cmd.arg("--raw-field")
142-
// Todo: perhaps have a count here? Or add information if it has a device or not
143-
.arg(format!("name={}", config.name))
144-
.arg("--raw-field")
145-
.arg(format!("work_folder={}", config.work_folder))
146-
.arg("--field")
147-
.arg(format!("runner_group_id={}", config.runner_group_id));
148158

149159
let output = cmd
150160
.output()
@@ -156,6 +166,17 @@ fn spawn_runner(config: &RunnerConfig) -> Result<process::Child, SpawnRunnerErro
156166
stderr,
157167
));
158168
}
169+
Ok(output)
170+
}
171+
172+
// todo: add arg for optional device to pass into the runner
173+
fn spawn_runner(config: &RunnerConfig) -> Result<process::Child, SpawnRunnerError> {
174+
let output = call_runner_api(
175+
&config.servo_ci_scope,
176+
"POST",
177+
"generate-jitconfig",
178+
Some(&config),
179+
)?;
159180

160181
let registration_info = String::from_utf8(output.stdout)?;
161182
let registration_info: Value = serde_json::from_str(&registration_info)?;
@@ -190,6 +211,38 @@ fn spawn_runner(config: &RunnerConfig) -> Result<process::Child, SpawnRunnerErro
190211
Ok(runner)
191212
}
192213

214+
#[derive(Debug, Deserialize)]
215+
struct ListRunnersResponse {
216+
id: u64,
217+
name: String,
218+
os: String,
219+
status: String,
220+
busy: bool,
221+
}
222+
223+
// Deregisters and kills runners that are offline according to gh api.
224+
fn kill_offline_runners(
225+
servo_ci_scope: &str,
226+
containers: &mut Vec<Child>,
227+
) -> Result<(), SpawnRunnerError> {
228+
let output = call_runner_api(&servo_ci_scope, "GET", "", None)?;
229+
let runner_response: Vec<ListRunnersResponse> = serde_json::from_slice(&output.stdout)?;
230+
231+
runner_response
232+
.iter()
233+
.filter(|runner| runner.name.contains("dresden-hos"))
234+
.filter(|runner| runner.status.contains("offline"));
235+
236+
for i in runner_response {
237+
call_runner_api(&servo_ci_scope, "DELETE", &i.id.to_string(), None)?;
238+
let cmd = Command::new("docker").arg()
239+
}
240+
241+
panic!("Kill docker containers");
242+
243+
Ok(())
244+
}
245+
193246
// Note: For now we assume linux x64. Compilation will fail on other platforms to remind us of that.
194247
#[cfg(target_os = "linux")]
195248
const OS_TAG: &str = "Linux";
@@ -219,34 +272,53 @@ fn main() -> anyhow::Result<()> {
219272
let mut running_hos_runners = vec![];
220273
// Todo: implement something to reserve devices for the duration of the docker run child process.
221274
const MAX_HOS_RUNNERS: usize = 1;
275+
let mut retries = 0;
222276

223277
loop {
224278
let exiting = EXITING.load(Ordering::Relaxed);
225279
if running_hos_builders.len() < args.concurrent_builders.into() && exiting == 0 {
226280
match spawn_runner(&RunnerConfig::new_hos_builder(&servo_ci_scope)) {
227-
Ok(child) => running_hos_builders.push(child),
228-
Err(SpawnRunnerError::GhApiError(_, message)) if message.contains("gh: Already exists") => {
281+
Ok(child) => {
282+
retries = 0;
283+
running_hos_builders.push(child)
284+
}
285+
Err(SpawnRunnerError::GhApiError(_, message))
286+
if message.contains("gh: Already exists") =>
287+
{
229288
// Might happen if containers were not killed properly after a forced exit.
230289
info!("Runner name already taken - Will retry with new name later.")
231290
}
232291
Err(e) => {
233292
error!("Failed to spawn JIT runner: {e:?}");
234293
thread::sleep(Duration::from_millis(500));
235-
// todo: abort if we retying likely wont solve the issue!
294+
retries += 1;
295+
if retries > MAX_SPAWN_RETRIES {
296+
println!("We had {retries} many times to spawn a runner/builder. It is not happening.");
297+
std::process::exit(-1);
298+
}
236299
}
237300
};
238301
}
239302
if running_hos_runners.len() < MAX_HOS_RUNNERS && exiting == 0 {
240303
match RunnerConfig::new_hos_runner(&servo_ci_scope).and_then(|cfg| spawn_runner(&cfg)) {
241-
Ok(child) => running_hos_runners.push(child),
242-
Err(SpawnRunnerError::GhApiError(_, message)) if message.contains("gh: Already exists") => {
304+
Ok(child) => {
305+
retries = 0;
306+
running_hos_runners.push(child)
307+
}
308+
Err(SpawnRunnerError::GhApiError(_, message))
309+
if message.contains("gh: Already exists") =>
310+
{
243311
// Might happen if containers were not killed properly after a forced exit.
244312
info!("Runner name already taken - Will retry with new name later.")
245313
}
246314
Err(e) => {
247315
error!("Failed to spawn JIT runner with HOS device: {e:?}");
248316
thread::sleep(Duration::from_millis(500));
249-
// todo: abort if we retying likely wont solve the issue!
317+
retries += 1;
318+
if retries > MAX_SPAWN_RETRIES {
319+
println!("We had {retries} many times to spawn a runner/builder. It is not happening.");
320+
std::process::exit(-1);
321+
}
250322
}
251323
};
252324
}
@@ -278,6 +350,10 @@ fn main() -> anyhow::Result<()> {
278350
}
279351
}
280352
}
353+
354+
// Check whether some still-running images are listed as offline from the GitHub API's point of view.
355+
kill_offline_runners(servo_ci_scope, &mut still_running);
356+
281357
running_hos_runners = still_running;
282358

283359
if running_hos_builders.is_empty()

0 commit comments

Comments
 (0)