23
23
from github_runner_manager .metrics import events as metric_events
24
24
from github_runner_manager .metrics import github as github_metrics
25
25
from github_runner_manager .metrics import runner as runner_metrics
26
- from github_runner_manager .metrics .reconcile import CLEANED_RUNNERS_TOTAL
27
26
from github_runner_manager .metrics .runner import RunnerMetrics
28
27
from github_runner_manager .openstack_cloud .constants import CREATE_SERVER_TIMEOUT
29
28
from github_runner_manager .platform .platform_provider import (
30
- DeleteRunnerBusyError ,
31
29
PlatformApiError ,
32
30
PlatformProvider ,
33
31
PlatformRunnerHealth ,
@@ -82,27 +80,33 @@ class RunnerInstance:
82
80
platform_state : PlatformRunnerState | None
83
81
cloud_state : CloudRunnerState
84
82
85
@classmethod
def from_cloud_and_platform_health(
    cls,
    cloud_instance: CloudRunnerInstance,
    platform_health_state: PlatformRunnerHealth | None,
) -> "RunnerInstance":
    """Build a RunnerInstance from a cloud instance and its platform health.

    Args:
        cloud_instance: Information on the cloud instance.
        platform_health_state: Health state in the platform provider, or None
            when no health information is available for the instance.

    Returns:
        The RunnerInstance instantiated from cloud instance and platform state.
    """
    # A platform state can only be derived when health information exists.
    if platform_health_state is None:
        derived_platform_state = None
    else:
        derived_platform_state = PlatformRunnerState.from_platform_health(
            platform_health_state
        )

    return cls(
        name=cloud_instance.name,
        instance_id=cloud_instance.instance_id,
        metadata=cloud_instance.metadata,
        health=cloud_instance.health,
        platform_state=derived_platform_state,
        cloud_state=cloud_instance.state,
    )
106
110
107
111
108
112
class RunnerManager :
@@ -183,7 +187,7 @@ def get_runners(self) -> tuple[RunnerInstance, ...]:
183
187
health_runners_map = {runner .identity .instance_id : runner for runner in runners_health }
184
188
for cloud_runner in cloud_runners :
185
189
if cloud_runner .instance_id not in health_runners_map :
186
- runner_instance = RunnerInstance (cloud_runner , None )
190
+ runner_instance = RunnerInstance . from_cloud_and_platform_health (cloud_runner , None )
187
191
runner_instance .health = HealthState .UNKNOWN
188
192
runner_instances .append (runner_instance )
189
193
continue
@@ -194,7 +198,9 @@ def get_runners(self) -> tuple[RunnerInstance, ...]:
194
198
cloud_runner .health = HealthState .HEALTHY
195
199
else :
196
200
cloud_runner .health = HealthState .UNHEALTHY
197
- runner_instance = RunnerInstance (cloud_runner , health_runner )
201
+ runner_instance = RunnerInstance .from_cloud_and_platform_health (
202
+ cloud_runner , health_runner
203
+ )
198
204
runner_instances .append (runner_instance )
199
205
return cast (tuple [RunnerInstance ], tuple (runner_instances ))
200
206
@@ -268,7 +274,9 @@ def _cleanup_resources(
268
274
)
269
275
cloud_runners = self ._cloud .get_runners ()
270
276
logger .info ("cleanup cloud_runners %s" , cloud_runners )
271
- runners_health_response = self ._platform .get_runners_health (cloud_runners )
277
+ runners_health_response = self ._platform .get_runners_health (
278
+ requested_runners = cloud_runners
279
+ )
272
280
logger .info ("cleanup health_response %s" , runners_health_response )
273
281
274
282
# Clean dangling resources in the cloud
@@ -295,7 +303,7 @@ def _cleanup_resources(
295
303
)
296
304
)
297
305
298
- if maximum_runners_to_delete :
306
+ if maximum_runners_to_delete is not None :
299
307
cloud_runners_to_delete .sort (
300
308
key = partial (_runner_deletion_sort_key , health_runners_map )
301
309
)
@@ -313,58 +321,70 @@ def _delete_cloud_runners(
313
321
runners_health : Sequence [PlatformRunnerHealth ],
314
322
delete_busy_runners : bool = False ,
315
323
) -> Iterable [runner_metrics .RunnerMetrics ]:
316
- """Delete runners in the platform ant the cloud.
324
+ """Delete runners in the platform and the cloud.
317
325
318
326
If delete_busy_runners is False, when the platform provider fails in deleting the
319
327
runner because it can be busy, will mean that that runner should not be deleted.
328
+
329
+ Runners without health information should not be deleted.
320
330
"""
321
- extracted_runner_metrics = []
322
- health_runners_map = {health .identity .instance_id : health for health in runners_health }
323
- for cloud_runner in cloud_runners :
324
- logging .info ("Trying to delete cloud_runner %s" , cloud_runner )
325
- runner_health = health_runners_map .get (cloud_runner .instance_id )
326
- if runner_health and runner_health .runner_in_platform :
327
- try :
328
- self ._platform .delete_runner (runner_health .identity )
329
- except DeleteRunnerBusyError :
330
- if not delete_busy_runners :
331
- logger .warning (
332
- "Skipping deletion as the runner is busy. %s" , cloud_runner .instance_id
333
- )
334
- continue
335
- logger .info ("Deleting busy runner: %s" , cloud_runner .instance_id )
336
- except PlatformApiError as exc :
337
- if not delete_busy_runners :
338
- logger .warning (
339
- "Failed to delete platform runner %s. %s. Skipping." ,
340
- cloud_runner .instance_id ,
341
- exc ,
342
- )
343
- continue
344
- logger .warning (
345
- "Deleting runner: %s after platform failure %s." ,
346
- cloud_runner .instance_id ,
347
- exc ,
348
- )
331
+ if not cloud_runners :
332
+ return []
349
333
350
- logging .info ("Delete runner in cloud: %s" , cloud_runner .instance_id )
351
- runner_metric = self ._cloud .delete_runner (cloud_runner .instance_id )
352
- CLEANED_RUNNERS_TOTAL .labels (self .manager_name ).inc (1 )
353
- if not runner_metric :
354
- logger .error ("No metrics returned after deleting %s" , cloud_runner .instance_id )
355
- else :
356
- extracted_runner_metrics .append (runner_metric )
357
- return extracted_runner_metrics
334
+ runner_identity_map = {
335
+ health_info .identity .instance_id : health_info .identity
336
+ for health_info in runners_health
337
+ }
338
+ platform_runner_ids_to_delete = [
339
+ # The runner_id cannot be None due to the if condition. the type system
340
+ # isn't able to catch that.
341
+ cast (str , runner_identity_map [runner .instance_id ].metadata .runner_id )
342
+ for runner in cloud_runners
343
+ if runner .instance_id in runner_identity_map
344
+ and runner_identity_map [runner .instance_id ].metadata .runner_id
345
+ ]
346
+ logger .info ("Deleting runners from platform: %s" , platform_runner_ids_to_delete )
347
+ deleted_runner_ids = self ._platform .delete_runners (
348
+ runner_ids = platform_runner_ids_to_delete
349
+ )
350
+ logger .info (
351
+ "Deleted runners from platform: %s (diff: %s)" ,
352
+ deleted_runner_ids ,
353
+ set (platform_runner_ids_to_delete ) - set (deleted_runner_ids ),
354
+ )
355
+
356
+ logger .info ("Cloud runners: %s" , cloud_runners )
357
+ cloud_vm_ids_to_delete = [
358
+ runner .instance_id
359
+ for runner in cloud_runners
360
+ # We can delete all VMs if delete_busy_runners is True
361
+ if delete_busy_runners
362
+ # We can delete the VM if no runner is associated with it
363
+ or not runner .metadata .runner_id
364
+ # We can delete the VM if it has been deleted from the Platform provider.
365
+ or runner .metadata .runner_id in deleted_runner_ids
366
+ ]
367
+ logger .info ("Extracting metrics from cloud VMs: %s" , cloud_vm_ids_to_delete )
368
+ extracted_metrics = self ._cloud .extract_metrics (instance_ids = cloud_vm_ids_to_delete )
369
+ logger .info ("Extracted metrics from cloud VMs: %s" , extracted_metrics )
370
+ logger .info ("Deleting VMs %s" , cloud_vm_ids_to_delete )
371
+ deleted_vm_ids = self ._cloud .delete_vms (instance_ids = cloud_vm_ids_to_delete )
372
+ logger .info (
373
+ "Deleted VMs: %s, (diff: %s)" ,
374
+ deleted_vm_ids ,
375
+ set (cloud_vm_ids_to_delete ) - set (deleted_vm_ids ),
376
+ )
377
+ return tuple (extracted_metrics )
358
378
359
379
def _clean_platform_runners(self, runners: list[RunnerIdentity]) -> None:
    """Clean the specified runners in the platform.

    Runners whose metadata carries no runner_id are ignored, as there is no
    platform ID to request deletion for.

    Args:
        runners: Identities of the runners to remove from the platform.
    """
    runner_ids_to_delete = [
        runner.metadata.runner_id for runner in runners if runner.metadata.runner_id
    ]
    # Guard after filtering: a non-empty input may still yield no IDs, and an
    # empty batch does not warrant a call to the platform provider.
    if not runner_ids_to_delete:
        return

    self._platform.delete_runners(runner_ids=runner_ids_to_delete)
368
388
369
389
@staticmethod
370
390
def _spawn_runners (
@@ -525,7 +545,7 @@ def _create_runner(args: _CreateRunnerArgs) -> InstanceID:
525
545
)
526
546
except RunnerError :
527
547
logger .warning ("Deleting runner %s from platform after creation failed" , instance_id )
528
- args .platform_provider .delete_runner ( runner_info . identity )
548
+ args .platform_provider .delete_runners ( runner_ids = [ args . metadata . runner_id ] )
529
549
raise
530
550
return instance_id
531
551
0 commit comments