@@ -15,6 +15,12 @@ use log::{debug, error, info, warn};
1515static RUNNER_ID : AtomicU64 = AtomicU64 :: new ( 0 ) ;
1616static EXITING : AtomicU32 = AtomicU32 :: new ( 0 ) ;
1717const MAX_SPAWN_RETRIES : u32 = 10 ;
18+ /// The final builder name will be {BUILDER_NAME}.{value of the RUNNER_SUFFIX_ENV environment variable}.{RUNNER_ID}; runner names are built the same way from RUNNER_NAME.
19+ const BUILDER_NAME : & str = "dresden-hos-builder" ;
20+ const RUNNER_NAME : & str = "dresden-hos-runner" ;
21+ const RUNNER_SUFFIX_ENV : & str = "RUNNER_SUFFIX" ;
22+ /// Delay in seconds before the checking loop starts again, so that we do not spam the API.
23+ const LOOP_TIMEOUT : u64 = 30 ;
1824
1925#[ derive( Parser , Debug ) ]
2026#[ clap( version) ]
@@ -45,8 +51,9 @@ impl RunnerConfig {
4551 RunnerConfig {
4652 servo_ci_scope : servo_ci_scope. to_string ( ) ,
4753 name : format ! (
48- "dresden-hos-builder.{}.{}" ,
49- std:: env:: var( "RUNNER_SUFFIX" ) . unwrap_or_default( ) ,
54+ "{}.{}.{}" ,
55+ BUILDER_NAME ,
56+ std:: env:: var( RUNNER_SUFFIX_ENV ) . unwrap_or_default( ) ,
5057 RUNNER_ID . fetch_add( 1 , Ordering :: Relaxed ) ,
5158 ) ,
5259 runner_group_id : 1 ,
@@ -88,8 +95,9 @@ impl RunnerConfig {
8895 Ok ( RunnerConfig {
8996 servo_ci_scope : servo_ci_scope. to_string ( ) ,
9097 name : format ! (
91- "dresden-hos-runner.{}.{}" ,
92- std:: env:: var( "RUNNER_SUFFIX" ) . unwrap_or_default( ) ,
98+ "{}.{}.{}" ,
99+ RUNNER_NAME ,
100+ std:: env:: var( RUNNER_SUFFIX_ENV ) . unwrap_or_default( ) ,
93101 RUNNER_ID . fetch_add( 1 , Ordering :: Relaxed )
94102 ) ,
95103 runner_group_id : 1 ,
@@ -152,7 +160,7 @@ fn call_github_runner_api(
152160 cmd. arg ( "--raw-field" ) . arg ( format ! ( "labels[]={label}" ) ) ;
153161 }
154162 cmd. arg ( "--raw-field" )
155- // Todo: perhaps have a count here? Or add information if it has a device or not
163+ // Todo: perhaps add information if it has a device or not
156164 . arg ( format ! ( "name={}" , config. name) )
157165 . arg ( "--raw-field" )
158166 . arg ( format ! ( "work_folder={}" , config. work_folder) )
@@ -242,8 +250,20 @@ fn kill_offline_runners(servo_ci_scope: &str) -> Result<(), SpawnRunnerError> {
242250 let filtered_response = runner_response
243251 . runners
244252 . iter ( )
245- . filter ( |runner| runner. name . contains ( "dresden-hos" ) )
246- . filter ( |runner| runner. status . contains ( "offline" ) ) ;
253+ . filter ( |runner| runner. status . contains ( "offline" ) )
254+ . filter ( |runner| {
255+ runner. name . contains (
256+ format ! (
257+ "{}.{}" ,
258+ RUNNER_NAME ,
259+ std:: env:: var( RUNNER_SUFFIX_ENV ) . unwrap_or_default( )
260+ ) || runner. name . contains ( format ! (
261+ "{}.{}" ,
262+ BUILDER_NAME ,
263+ std:: env:: var( RUNNER_SUFFIX_ENV ) . unwrap_or_default( )
264+ ) ) ,
265+ )
266+ } ) ;
247267
248268 for i in filtered_response {
249269 info ! (
@@ -297,45 +317,46 @@ fn main() -> anyhow::Result<()> {
297317 let mut running_hos_runners = vec ! [ ] ;
298318 // Todo: implement something to reserve devices for the duration of the docker run child process.
299319 const MAX_HOS_RUNNERS : usize = 1 ;
300- let mut retries = 0 ;
320+ let mut retries_builder = 0 ;
321+ let mut retries_runner = 0 ;
301322
302323 loop {
303324 let exiting = EXITING . load ( Ordering :: Relaxed ) ;
304325 if running_hos_builders. len ( ) < args. concurrent_builders . into ( ) && exiting == 0 {
305326 match spawn_runner ( & RunnerConfig :: new_hos_builder ( & servo_ci_scope) ) {
306327 Ok ( child) => {
307- retries = 0 ;
328+ retries_builder = 0 ;
308329 running_hos_builders. push ( child)
309330 }
310331 Err ( SpawnRunnerError :: GhApiError ( _, message) )
311332 if message. contains ( "gh: Already exists" ) =>
312333 {
313334 // Might happen if containers were not killed properly after a forced exit.
314- info ! ( "Runner name already taken - Will retry with new name later." ) ;
315- check_and_inc_retries ( & mut retries ) ;
335+ info ! ( "Runner name already taken - Will retry with new name later." ) ;
336+ check_and_inc_retries ( & mut retries_builder ) ;
316337 }
317338 Err ( e) => {
318339 error ! ( "Failed to spawn JIT runner: {e:?}" ) ;
319- check_and_inc_retries ( & mut retries ) ;
340+ check_and_inc_retries ( & mut retries_builder ) ;
320341 }
321342 } ;
322343 }
323344 if running_hos_runners. len ( ) < MAX_HOS_RUNNERS && exiting == 0 {
324345 match RunnerConfig :: new_hos_runner ( & servo_ci_scope) . and_then ( |cfg| spawn_runner ( & cfg) ) {
325346 Ok ( child) => {
326- retries = 0 ;
347+ retries_runner = 0 ;
327348 running_hos_runners. push ( child)
328349 }
329350 Err ( SpawnRunnerError :: GhApiError ( _, message) )
330351 if message. contains ( "gh: Already exists" ) =>
331352 {
332353 // Might happen if containers were not killed properly after a forced exit.
333354 info ! ( "Runner name already taken - Will retry with new name later." ) ;
334- check_and_inc_retries ( & mut retries ) ;
355+ check_and_inc_retries ( & mut retries_runner ) ;
335356 }
336357 Err ( e) => {
337358 error ! ( "Failed to spawn JIT runner with HOS device: {e:?}" ) ;
338- check_and_inc_retries ( & mut retries ) ;
359+ check_and_inc_retries ( & mut retries_runner ) ;
339360 }
340361 } ;
341362 }
@@ -392,7 +413,7 @@ fn main() -> anyhow::Result<()> {
392413 thread:: sleep ( Duration :: from_millis ( 500 ) ) ;
393414 }
394415
395- thread:: sleep ( Duration :: from_secs ( 5 ) ) ;
416+ thread:: sleep ( Duration :: from_secs ( LOOP_TIMEOUT ) ) ;
396417 // Check if some still running images are listed as offline from github api point of view
397418 if let Err ( e) = kill_offline_runners ( & servo_ci_scope) {
398419 error ! ( "Killing offline runners failed with {e:?}" ) ;
0 commit comments