Skip to content

Commit 5b822a7

Browse files
authored
feat: parallel deletion of runners (#588)
* feat: parallel deletion of runners * chore: minor refactor for imports * feat: add delete vms and extract metrics interface * feat: add delete runners interface * feat: delete runner controller implementation * feat: delete runner implementation for GitHub platform * feat: delete runner implementation on jobmanager * feat: parallel vm deletion and metrics extraction * feat: cloud manager + cloud delete vms implementation * chore: reorder helper func * test: runner pull metrics * feat: replace singular delete with parallel deletes * chore: ignore docstring errors in abc * chore: lint fixes * test: redo tests * test: fix openstack fixture * chore: log extracted metric s * debug * chore: debug log * chore: try logging with file * general exception catch w/ traceback * fix: construct connection on subprocess call * chore: minor lint fixes * chore: remove comments * chore: revert debug workflow * chore: tidy up factory comments * chore: remove reusing connection * fix: pyproject merge conflict fix * fix: adapt changes to SelfHostedRunnerLabels * fix: pyproject toml conflicting test ignores * chore: remove todo comments * feat: use thread pool for concurrency * feat: limit max workers * feat: concurrent github runner delete request using threadpool * feat: concurrent metrics fetching using multithreading * chore: rename DeleteVMError * chore: update unused argument comment * chore: rename mock to fake * fix: lint issues w docstring * debug * fix: instantiate vars before assignment * chore: delete unused test bin * chore: pass down delete timeout * chore: update keypair delete to fire and forget * chore: undo debug * fix: lint rules * chore: fix meta docstring * chore: remove unused platform argument after multiplexer ejection * fix: result returns none * fix: only return truely deleted openstack IDs
1 parent b3da781 commit 5b822a7

22 files changed

+1911
-1326
lines changed

github-runner-manager/pyproject.toml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,15 @@ select = ["E", "W", "F", "C", "N", "R", "D", "H"]
7979
# Ignore W503,E203 because using black creates errors with this
8080
# Ignore D107 Missing docstring in __init__
8181
ignore = ["W503", "D107", "E203"]
82-
# D100, D101, D102, D103, D104: Ignore docstring style issues in tests
83-
# DCO020, DCO030, DCO050: Ignore docstring argument,returns,raises sections in tests
84-
per-file-ignores = ["tests/*:D100,D101,D102,D103,D104,D205,D212, DCO020, DCO030, DCO050"]
82+
per-file-ignores = [
83+
# Ignore factory methods attributes docstring
84+
"tests/unit/factories/*:DCO060",
85+
# Ignore no return values (DCO031) in docstring for abstract methods
86+
"src/github_runner_manager/manager/cloud_runner_manager.py:DCO031",
87+
# DCO020, DCO030, DCO050, DCO060: Ignore docstring argument, returns, raises, attribute
88+
# sections in tests
89+
"tests/*:D100,D101,D102,D103,D104,D205,D212,DCO020,DCO030,DCO050,DCO060",
90+
]
8591
docstring-convention = "google"
8692
# Check for properly formatted copyright header in each file
8793
copyright-check = "True"

github-runner-manager/src/github_runner_manager/github_client.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,7 @@
2323
from requests import RequestException
2424
from typing_extensions import assert_never
2525

26-
from github_runner_manager.configuration.github import (
27-
GitHubOrg,
28-
GitHubPath,
29-
GitHubRepo,
30-
)
26+
from github_runner_manager.configuration.github import GitHubOrg, GitHubPath, GitHubRepo
3127
from github_runner_manager.manager.models import InstanceID
3228
from github_runner_manager.platform.platform_provider import (
3329
DeleteRunnerBusyError,

github-runner-manager/src/github_runner_manager/manager/cloud_runner_manager.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -35,20 +35,6 @@ class HealthState(Enum):
3535
UNHEALTHY = auto()
3636
UNKNOWN = auto()
3737

38-
@staticmethod
39-
def from_value(health: bool | None) -> "HealthState":
40-
"""Create from a health value.
41-
42-
Args:
43-
health: The health value as boolean or None.
44-
45-
Returns:
46-
The health state.
47-
"""
48-
if health is None:
49-
return HealthState.UNKNOWN
50-
return HealthState.HEALTHY if health else HealthState.UNHEALTHY
51-
5238

5339
class CloudRunnerState(str, Enum):
5440
"""Represent state of the instance hosting the runner.
@@ -275,11 +261,25 @@ def get_runners(self) -> Sequence[CloudRunnerInstance]:
275261
"""Get cloud self-hosted runners."""
276262

277263
@abc.abstractmethod
278-
def delete_runner(self, instance_id: InstanceID) -> RunnerMetrics | None:
279-
"""Delete self-hosted runner.
264+
def delete_vms(self, instance_ids: Sequence[InstanceID]) -> list[InstanceID]:
265+
"""Delete cloud VM instances.
266+
267+
Args:
268+
instance_ids: The ID of the VMs to request deletion.
269+
270+
Returns:
271+
The deleted instance IDs.
272+
"""
273+
274+
@abc.abstractmethod
275+
def extract_metrics(self, instance_ids: Sequence[InstanceID]) -> list[RunnerMetrics]:
276+
"""Extract metrics from cloud VMs.
280277
281278
Args:
282-
instance_id: The instance id of the runner to delete.
279+
instance_ids: The VM instance IDs to fetch the metrics from.
280+
281+
Returns:
282+
The fetched runner metrics.
283283
"""
284284

285285
@abc.abstractmethod

github-runner-manager/src/github_runner_manager/manager/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ class InstanceID:
3232
"""
3333

3434
prefix: str
35-
reactive: bool | None
3635
suffix: str
36+
reactive: bool | None = None
3737

3838
@property
3939
def name(self) -> str:

github-runner-manager/src/github_runner_manager/manager/runner_manager.py

Lines changed: 83 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,9 @@
2323
from github_runner_manager.metrics import events as metric_events
2424
from github_runner_manager.metrics import github as github_metrics
2525
from github_runner_manager.metrics import runner as runner_metrics
26-
from github_runner_manager.metrics.reconcile import CLEANED_RUNNERS_TOTAL
2726
from github_runner_manager.metrics.runner import RunnerMetrics
2827
from github_runner_manager.openstack_cloud.constants import CREATE_SERVER_TIMEOUT
2928
from github_runner_manager.platform.platform_provider import (
30-
DeleteRunnerBusyError,
3129
PlatformApiError,
3230
PlatformProvider,
3331
PlatformRunnerHealth,
@@ -82,27 +80,33 @@ class RunnerInstance:
8280
platform_state: PlatformRunnerState | None
8381
cloud_state: CloudRunnerState
8482

85-
def __init__(
86-
self,
83+
@classmethod
84+
def from_cloud_and_platform_health(
85+
cls,
8786
cloud_instance: CloudRunnerInstance,
8887
platform_health_state: PlatformRunnerHealth | None,
89-
):
88+
) -> "RunnerInstance":
9089
"""Construct an instance.
9190
9291
Args:
9392
cloud_instance: Information on the cloud instance.
9493
platform_health_state: Health state in the platform provider.
94+
95+
Returns:
96+
The RunnerInstance instantiated from cloud instance and platform state.
9597
"""
96-
self.name = cloud_instance.name
97-
self.instance_id = cloud_instance.instance_id
98-
self.metadata = cloud_instance.metadata
99-
self.health = cloud_instance.health
100-
self.platform_state = (
101-
PlatformRunnerState.from_platform_health(platform_health_state)
102-
if platform_health_state is not None
103-
else None
98+
return cls(
99+
name=cloud_instance.name,
100+
instance_id=cloud_instance.instance_id,
101+
metadata=cloud_instance.metadata,
102+
health=cloud_instance.health,
103+
platform_state=(
104+
PlatformRunnerState.from_platform_health(platform_health_state)
105+
if platform_health_state is not None
106+
else None
107+
),
108+
cloud_state=cloud_instance.state,
104109
)
105-
self.cloud_state = cloud_instance.state
106110

107111

108112
class RunnerManager:
@@ -183,7 +187,7 @@ def get_runners(self) -> tuple[RunnerInstance, ...]:
183187
health_runners_map = {runner.identity.instance_id: runner for runner in runners_health}
184188
for cloud_runner in cloud_runners:
185189
if cloud_runner.instance_id not in health_runners_map:
186-
runner_instance = RunnerInstance(cloud_runner, None)
190+
runner_instance = RunnerInstance.from_cloud_and_platform_health(cloud_runner, None)
187191
runner_instance.health = HealthState.UNKNOWN
188192
runner_instances.append(runner_instance)
189193
continue
@@ -194,7 +198,9 @@ def get_runners(self) -> tuple[RunnerInstance, ...]:
194198
cloud_runner.health = HealthState.HEALTHY
195199
else:
196200
cloud_runner.health = HealthState.UNHEALTHY
197-
runner_instance = RunnerInstance(cloud_runner, health_runner)
201+
runner_instance = RunnerInstance.from_cloud_and_platform_health(
202+
cloud_runner, health_runner
203+
)
198204
runner_instances.append(runner_instance)
199205
return cast(tuple[RunnerInstance], tuple(runner_instances))
200206

@@ -268,7 +274,9 @@ def _cleanup_resources(
268274
)
269275
cloud_runners = self._cloud.get_runners()
270276
logger.info("cleanup cloud_runners %s", cloud_runners)
271-
runners_health_response = self._platform.get_runners_health(cloud_runners)
277+
runners_health_response = self._platform.get_runners_health(
278+
requested_runners=cloud_runners
279+
)
272280
logger.info("cleanup health_response %s", runners_health_response)
273281

274282
# Clean dangling resources in the cloud
@@ -295,7 +303,7 @@ def _cleanup_resources(
295303
)
296304
)
297305

298-
if maximum_runners_to_delete:
306+
if maximum_runners_to_delete is not None:
299307
cloud_runners_to_delete.sort(
300308
key=partial(_runner_deletion_sort_key, health_runners_map)
301309
)
@@ -313,58 +321,70 @@ def _delete_cloud_runners(
313321
runners_health: Sequence[PlatformRunnerHealth],
314322
delete_busy_runners: bool = False,
315323
) -> Iterable[runner_metrics.RunnerMetrics]:
316-
"""Delete runners in the platform ant the cloud.
324+
"""Delete runners in the platform and the cloud.
317325
318326
If delete_busy_runners is False, when the platform provider fails in deleting the
319327
runner because it can be busy, will mean that that runner should not be deleted.
328+
329+
Runners without health information should not be deleted.
320330
"""
321-
extracted_runner_metrics = []
322-
health_runners_map = {health.identity.instance_id: health for health in runners_health}
323-
for cloud_runner in cloud_runners:
324-
logging.info("Trying to delete cloud_runner %s", cloud_runner)
325-
runner_health = health_runners_map.get(cloud_runner.instance_id)
326-
if runner_health and runner_health.runner_in_platform:
327-
try:
328-
self._platform.delete_runner(runner_health.identity)
329-
except DeleteRunnerBusyError:
330-
if not delete_busy_runners:
331-
logger.warning(
332-
"Skipping deletion as the runner is busy. %s", cloud_runner.instance_id
333-
)
334-
continue
335-
logger.info("Deleting busy runner: %s", cloud_runner.instance_id)
336-
except PlatformApiError as exc:
337-
if not delete_busy_runners:
338-
logger.warning(
339-
"Failed to delete platform runner %s. %s. Skipping.",
340-
cloud_runner.instance_id,
341-
exc,
342-
)
343-
continue
344-
logger.warning(
345-
"Deleting runner: %s after platform failure %s.",
346-
cloud_runner.instance_id,
347-
exc,
348-
)
331+
if not cloud_runners:
332+
return []
349333

350-
logging.info("Delete runner in cloud: %s", cloud_runner.instance_id)
351-
runner_metric = self._cloud.delete_runner(cloud_runner.instance_id)
352-
CLEANED_RUNNERS_TOTAL.labels(self.manager_name).inc(1)
353-
if not runner_metric:
354-
logger.error("No metrics returned after deleting %s", cloud_runner.instance_id)
355-
else:
356-
extracted_runner_metrics.append(runner_metric)
357-
return extracted_runner_metrics
334+
runner_identity_map = {
335+
health_info.identity.instance_id: health_info.identity
336+
for health_info in runners_health
337+
}
338+
platform_runner_ids_to_delete = [
339+
# The runner_id cannot be None due to the if condition. the type system
340+
# isn't able to catch that.
341+
cast(str, runner_identity_map[runner.instance_id].metadata.runner_id)
342+
for runner in cloud_runners
343+
if runner.instance_id in runner_identity_map
344+
and runner_identity_map[runner.instance_id].metadata.runner_id
345+
]
346+
logger.info("Deleting runners from platform: %s", platform_runner_ids_to_delete)
347+
deleted_runner_ids = self._platform.delete_runners(
348+
runner_ids=platform_runner_ids_to_delete
349+
)
350+
logger.info(
351+
"Deleted runners from platform: %s (diff: %s)",
352+
deleted_runner_ids,
353+
set(platform_runner_ids_to_delete) - set(deleted_runner_ids),
354+
)
355+
356+
logger.info("Cloud runners: %s", cloud_runners)
357+
cloud_vm_ids_to_delete = [
358+
runner.instance_id
359+
for runner in cloud_runners
360+
# We can delete all VMs if delete_busy_runners is True
361+
if delete_busy_runners
362+
# We can delete the VM if no runner is associated with it
363+
or not runner.metadata.runner_id
364+
# We can delete the VM if it has been deleted from the Platform provider.
365+
or runner.metadata.runner_id in deleted_runner_ids
366+
]
367+
logger.info("Extracting metrics from cloud VMs: %s", cloud_vm_ids_to_delete)
368+
extracted_metrics = self._cloud.extract_metrics(instance_ids=cloud_vm_ids_to_delete)
369+
logger.info("Extracted metrics from cloud VMs: %s", extracted_metrics)
370+
logger.info("Deleting VMs %s", cloud_vm_ids_to_delete)
371+
deleted_vm_ids = self._cloud.delete_vms(instance_ids=cloud_vm_ids_to_delete)
372+
logger.info(
373+
"Deleted VMs: %s, (diff: %s)",
374+
deleted_vm_ids,
375+
set(cloud_vm_ids_to_delete) - set(deleted_vm_ids),
376+
)
377+
return tuple(extracted_metrics)
358378

359379
def _clean_platform_runners(self, runners: list[RunnerIdentity]) -> None:
360380
"""Clean the specified runners in the platform."""
361-
for runner in runners:
362-
try:
363-
self._platform.delete_runner(runner)
364-
except DeleteRunnerBusyError:
365-
logger.warning("Tried to delete busy runner in cleanup %s", runner)
366-
except PlatformApiError:
367-
logger.warning("Failed to delete platform runner %s", runner)
381+
if not runners:
382+
return
383+
384+
runner_ids_to_delete = [
385+
runner.metadata.runner_id for runner in runners if runner.metadata.runner_id
386+
]
387+
self._platform.delete_runners(runner_ids=runner_ids_to_delete)
368388

369389
@staticmethod
370390
def _spawn_runners(
@@ -525,7 +545,7 @@ def _create_runner(args: _CreateRunnerArgs) -> InstanceID:
525545
)
526546
except RunnerError:
527547
logger.warning("Deleting runner %s from platform after creation failed", instance_id)
528-
args.platform_provider.delete_runner(runner_info.identity)
548+
args.platform_provider.delete_runners(runner_ids=[args.metadata.runner_id])
529549
raise
530550
return instance_id
531551

github-runner-manager/src/github_runner_manager/manager/runner_scaler.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ class _ReconcileMetricData:
9494
start_timestamp: float
9595
end_timestamp: float
9696
metric_stats: IssuedMetricEventsStats
97-
runner_list: tuple[RunnerInstance]
97+
runner_list: tuple[RunnerInstance, ...]
9898
flavor: str
9999
expected_runner_quantity: int
100100

@@ -375,7 +375,7 @@ def _reconcile_non_reactive(self, expected_quantity: int) -> _ReconcileResult:
375375
return _ReconcileResult(runner_diff=runner_diff, metric_stats=metric_stats)
376376

377377
@staticmethod
378-
def _log_runners(runner_list: tuple[RunnerInstance]) -> None:
378+
def _log_runners(runner_list: tuple[RunnerInstance, ...]) -> None:
379379
"""Log information about the runners found.
380380
381381
Args:
@@ -446,7 +446,6 @@ def _issue_reconciliation_metric(
446446
IDLE_RUNNERS_COUNT.labels(manager_name).set(len(idle_runners))
447447

448448
try:
449-
450449
metric_events.issue_event(
451450
metric_events.Reconciliation(
452451
timestamp=time.time(),

0 commit comments

Comments
 (0)