1+ use serde:: Deserialize ;
12use serde_json:: Value ;
23use std:: {
3- process:: { self , Command } ,
4+ process:: { self , Child , Command , Output } ,
45 string:: FromUtf8Error ,
56 sync:: atomic:: { AtomicU32 , AtomicU64 , Ordering } ,
67 thread,
@@ -13,6 +14,7 @@ use clap::Parser;
1314use log:: { debug, error, info, warn} ;
1415static RUNNER_ID : AtomicU64 = AtomicU64 :: new ( 0 ) ;
1516static EXITING : AtomicU32 = AtomicU32 :: new ( 0 ) ;
17+ const MAX_SPAWN_RETRIES : u32 = 10 ;
1618
1719#[ derive( Parser , Debug ) ]
1820#[ clap( version) ]
@@ -115,36 +117,44 @@ enum SpawnRunnerError {
115117 NoHdcDeviceFound ,
116118 #[ error( "Failed to list USB devices" ) ]
117119 LsUsbError ,
120+ #[ error( "Failed to deserialize list runners api" ) ]
121+ LIstRunnersDeserialize ,
118122}
119123
120- // todo: add arg for optional device to pass into the runner
121- fn spawn_runner ( config : & RunnerConfig ) -> Result < process:: Child , SpawnRunnerError > {
122- // Note: octocrab apparently requires more coarse grained tokens compared to `gh`, so we use `gh`.
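124+ /// Calls the GitHub Actions runners API through the `gh` CLI. `raw` is only set by `spawn_runner`;
125+ /// when present, its fields are sent as request fields. Returns the output of the _executed_ command.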
126+ /// Note: octocrab apparently requires more coarse grained tokens compared
127+ /// to `gh`, so we use `gh`.
128+ fn call_runner_api (
129+ ci_scope : & str ,
130+ method : & str ,
131+ api_endpoint : & str ,
132+ raw : Option < & RunnerConfig > ,
133+ ) -> Result < Output , SpawnRunnerError > {
123134 let mut cmd = Command :: new ( "gh" ) ;
124- let api_endpoint = format ! (
125- "{}/actions/runners/generate-jitconfig" ,
126- config. servo_ci_scope
127- ) ;
135+ let api_endpoint = format ! ( "{ci_scope}/actions/runners/{api_endpoint}" ) ;
128136 cmd. args ( [
129137 "api" ,
130138 "--method" ,
131- "POST" ,
139+ method ,
132140 "-H" ,
133141 "Accept: application/vnd.github+json" ,
134142 "-H" ,
135143 "X-GitHub-Api-Version: 2022-11-28" ,
136144 & api_endpoint,
137145 ] ) ;
138- for label in & config. labels {
139- cmd. arg ( "--raw-field" ) . arg ( format ! ( "labels[]={label}" ) ) ;
146+ if let Some ( config) = raw {
147+ for label in & config. labels {
148+ cmd. arg ( "--raw-field" ) . arg ( format ! ( "labels[]={label}" ) ) ;
149+ }
150+ cmd. arg ( "--raw-field" )
151+ // Todo: perhaps have a count here? Or add information if it has a device or not
152+ . arg ( format ! ( "name={}" , config. name) )
153+ . arg ( "--raw-field" )
154+ . arg ( format ! ( "work_folder={}" , config. work_folder) )
155+ . arg ( "--field" )
156+ . arg ( format ! ( "runner_group_id={}" , config. runner_group_id) ) ;
140157 }
141- cmd. arg ( "--raw-field" )
142- // Todo: perhaps have a count here? Or add information if it has a device or not
143- . arg ( format ! ( "name={}" , config. name) )
144- . arg ( "--raw-field" )
145- . arg ( format ! ( "work_folder={}" , config. work_folder) )
146- . arg ( "--field" )
147- . arg ( format ! ( "runner_group_id={}" , config. runner_group_id) ) ;
148158
149159 let output = cmd
150160 . output ( )
@@ -156,6 +166,17 @@ fn spawn_runner(config: &RunnerConfig) -> Result<process::Child, SpawnRunnerErro
156166 stderr,
157167 ) ) ;
158168 }
169+ Ok ( output)
170+ }
171+
172+ // todo: add arg for optional device to pass into the runner
173+ fn spawn_runner ( config : & RunnerConfig ) -> Result < process:: Child , SpawnRunnerError > {
174+ let output = call_runner_api (
175+ & config. servo_ci_scope ,
176+ "POST" ,
177+ "generate-jitconfig" ,
178+ Some ( & config) ,
179+ ) ?;
159180
160181 let registration_info = String :: from_utf8 ( output. stdout ) ?;
161182 let registration_info: Value = serde_json:: from_str ( & registration_info) ?;
@@ -190,6 +211,38 @@ fn spawn_runner(config: &RunnerConfig) -> Result<process::Child, SpawnRunnerErro
190211 Ok ( runner)
191212}
192213
214+ #[ derive( Debug , Deserialize ) ]
215+ struct ListRunnersResponse {
216+ id : u64 ,
217+ name : String ,
218+ os : String ,
219+ status : String ,
220+ busy : bool ,
221+ }
222+
223+ // Deregisters and kills runners that are offline according to gh api.
224+ fn kill_offline_runners (
225+ servo_ci_scope : & str ,
226+ containers : & mut Vec < Child > ,
227+ ) -> Result < ( ) , SpawnRunnerError > {
228+ let output = call_runner_api ( & servo_ci_scope, "GET" , "" , None ) ?;
229+ let runner_response: Vec < ListRunnersResponse > = serde_json:: from_slice ( & output. stdout ) ?;
230+
231+ runner_response
232+ . iter ( )
233+ . filter ( |runner| runner. name . contains ( "dresden-hos" ) )
234+ . filter ( |runner| runner. status . contains ( "offline" ) ) ;
235+
236+ for i in runner_response {
237+ call_runner_api ( & servo_ci_scope, "DELETE" , & i. id . to_string ( ) , None ) ?;
238+ let cmd = Command :: new ( "docker" ) . arg ( )
239+ }
240+
241+ panic ! ( "Kill docker containers" ) ;
242+
243+ Ok ( ( ) )
244+ }
245+
193246// Note: For now we assume linux x64. Compilation will fail on other platforms to remind us of that.
194247#[ cfg( target_os = "linux" ) ]
195248const OS_TAG : & str = "Linux" ;
@@ -219,34 +272,53 @@ fn main() -> anyhow::Result<()> {
219272 let mut running_hos_runners = vec ! [ ] ;
220273 // Todo: implement something to reserve devices for the duration of the docker run child process.
221274 const MAX_HOS_RUNNERS : usize = 1 ;
275+ let mut retries = 0 ;
222276
223277 loop {
224278 let exiting = EXITING . load ( Ordering :: Relaxed ) ;
225279 if running_hos_builders. len ( ) < args. concurrent_builders . into ( ) && exiting == 0 {
226280 match spawn_runner ( & RunnerConfig :: new_hos_builder ( & servo_ci_scope) ) {
227- Ok ( child) => running_hos_builders. push ( child) ,
228- Err ( SpawnRunnerError :: GhApiError ( _, message) ) if message. contains ( "gh: Already exists" ) => {
281+ Ok ( child) => {
282+ retries = 0 ;
283+ running_hos_builders. push ( child)
284+ }
285+ Err ( SpawnRunnerError :: GhApiError ( _, message) )
286+ if message. contains ( "gh: Already exists" ) =>
287+ {
229288 // Might happen if containers were not killed properly after a forced exit.
230289 info ! ( "Runner name already taken - Will retry with new name later." )
231290 }
232291 Err ( e) => {
233292 error ! ( "Failed to spawn JIT runner: {e:?}" ) ;
234293 thread:: sleep ( Duration :: from_millis ( 500 ) ) ;
235- // todo: abort if we retying likely wont solve the issue!
294+ retries += 1 ;
295+ if retries > MAX_SPAWN_RETRIES {
296+ println ! ( "We had {retries} many times to spawn a runner/builder. It is not happening." ) ;
297+ std:: process:: exit ( -1 ) ;
298+ }
236299 }
237300 } ;
238301 }
239302 if running_hos_runners. len ( ) < MAX_HOS_RUNNERS && exiting == 0 {
240303 match RunnerConfig :: new_hos_runner ( & servo_ci_scope) . and_then ( |cfg| spawn_runner ( & cfg) ) {
241- Ok ( child) => running_hos_runners. push ( child) ,
242- Err ( SpawnRunnerError :: GhApiError ( _, message) ) if message. contains ( "gh: Already exists" ) => {
304+ Ok ( child) => {
305+ retries = 0 ;
306+ running_hos_runners. push ( child)
307+ }
308+ Err ( SpawnRunnerError :: GhApiError ( _, message) )
309+ if message. contains ( "gh: Already exists" ) =>
310+ {
243311 // Might happen if containers were not killed properly after a forced exit.
244312 info ! ( "Runner name already taken - Will retry with new name later." )
245313 }
246314 Err ( e) => {
247315 error ! ( "Failed to spawn JIT runner with HOS device: {e:?}" ) ;
248316 thread:: sleep ( Duration :: from_millis ( 500 ) ) ;
249- // todo: abort if we retying likely wont solve the issue!
317+ retries += 1 ;
318+ if retries > MAX_SPAWN_RETRIES {
319+ println ! ( "We had {retries} many times to spawn a runner/builder. It is not happening." ) ;
320+ std:: process:: exit ( -1 ) ;
321+ }
250322 }
251323 } ;
252324 }
@@ -278,6 +350,10 @@ fn main() -> anyhow::Result<()> {
278350 }
279351 }
280352 }
353+
354+ // Check if some still running images are listed as offline from github api point of view
355+ kill_offline_runners ( servo_ci_scope, & mut still_running) ;
356+
281357 running_hos_runners = still_running;
282358
283359 if running_hos_builders. is_empty ( )
0 commit comments