Skip to content

Commit d39eb27

Browse files
committed
server: job run time limit should ignore time spent queued
1 parent 562ba79 commit d39eb27

File tree

2 files changed

+42
-36
lines changed

2 files changed

+42
-36
lines changed

server/src/db/tables/job_event.rs

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2023 Oxide Computer Company
2+
* Copyright 2025 Oxide Computer Company
33
*/
44

55
use super::sublude::*;
@@ -75,8 +75,4 @@ impl JobEvent {
7575
])
7676
.to_owned()
7777
}
78-
79-
pub fn age(&self) -> Duration {
80-
self.time.age()
81-
}
8278
}

server/src/workers.rs

Lines changed: 41 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2021 Oxide Computer Company
2+
* Copyright 2025 Oxide Computer Company
33
*/
44

55
use chrono::prelude::*;
@@ -87,40 +87,50 @@ async fn worker_cleanup_one(log: &Logger, c: &Central) -> Result<()> {
8787
continue;
8888
}
8989

90+
if j.worker.is_none() {
91+
/*
92+
* This job has not yet been assigned to a worker. We do not
93+
* want to cancel jobs that have merely been queued for a long
94+
* time.
95+
*/
96+
continue;
97+
}
98+
9099
/*
91-
* Determine when we assigned this job to a worker by looking at the
92-
* timestamp on the first control event.
100+
* Determine when we assigned this job to a worker:
93101
*/
94-
let control =
95-
c.db.job_events(j.id, 0, 10_000)?
96-
.iter()
97-
.find(|jev| jev.stream == "control")
98-
.cloned();
99-
if let Some(control) = control {
100-
if control.age().as_secs() > c.config.job.max_runtime {
101-
warn!(
102-
log,
103-
"job {} duration {} exceeds {} seconds; \
102+
let times = c.db.job_times(j.id)?;
103+
let Some(atime) = times.get("assigned") else {
104+
continue;
105+
};
106+
let age = Utc::now()
107+
.signed_duration_since(atime)
108+
.to_std()
109+
.unwrap_or_else(|_| std::time::Duration::from_secs(0));
110+
111+
if age.as_secs() > c.config.job.max_runtime {
112+
warn!(
113+
log,
114+
"job {} duration {} exceeds {} seconds; \
104115
recycling worker {}",
105-
j.id,
106-
control.age().as_secs(),
116+
j.id,
117+
age.as_secs(),
118+
c.config.job.max_runtime,
119+
w.id,
120+
);
121+
c.db.job_append_event(
122+
j.id,
123+
None,
124+
"control",
125+
Utc::now(),
126+
None,
127+
&format!(
128+
"job duration {} exceeds {} seconds; aborting",
129+
age.as_secs(),
107130
c.config.job.max_runtime,
108-
w.id,
109-
);
110-
c.db.job_append_event(
111-
j.id,
112-
None,
113-
"control",
114-
Utc::now(),
115-
None,
116-
&format!(
117-
"job duration {} exceeds {} seconds; aborting",
118-
control.age().as_secs(),
119-
c.config.job.max_runtime,
120-
),
121-
)?;
122-
c.db.worker_recycle(w.id)?;
123-
}
131+
),
132+
)?;
133+
c.db.worker_recycle(w.id)?;
124134
}
125135
}
126136
}

0 commit comments

Comments
 (0)