Skip to content

Commit b51b110

Browse files
Remove SSH creation health checks (#534)
1 parent e4e0901 commit b51b110

20 files changed

+711
-128
lines changed

docs/changelog.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
- Add a visualization of the share of jobs started per application.
77

8-
98
### 2025-04-22
109

1110
- Add how-to landing page.

github-runner-manager/src/github_runner_manager/errors.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,6 @@ class RunnerCreateError(RunnerError):
1313
"""Error for runner creation failure."""
1414

1515

16-
class RunnerStartError(RunnerError):
17-
"""Error for runner start failure."""
18-
19-
2016
class MissingServerConfigError(RunnerError):
2117
"""Error for unable to create runner due to missing server configurations."""
2218

@@ -46,7 +42,7 @@ class TokenError(PlatformClientError):
4642

4743

4844
class JobNotFoundError(PlatformClientError):
49-
"""Represents an error when the job could not be found on GitHub."""
45+
"""Represents an error when the job could not be found on the platform."""
5046

5147

5248
class CloudError(Exception):

github-runner-manager/src/github_runner_manager/github_client.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
from urllib.error import HTTPError
1414

1515
import requests
16+
17+
# HTTP404NotFoundError is not found by pylint
18+
from fastcore.net import HTTP404NotFoundError # pylint: disable=no-name-in-module
1619
from ghapi.all import GhApi, pages
1720
from ghapi.page import paged
1821
from requests import RequestException
@@ -29,6 +32,11 @@
2932

3033
logger = logging.getLogger(__name__)
3134

35+
36+
class GithubRunnerNotFoundError(Exception):
37+
"""Represents an error when the runner could not be found on GitHub."""
38+
39+
3240
# Parameters of the function decorated with retry
3341
ParamT = ParamSpec("ParamT")
3442
# Return type of the function decorated with retry
@@ -89,8 +97,38 @@ def __init__(self, token: str):
8997
self._client = GhApi(token=self._token)
9098

9199
@catch_http_errors
92-
def get_runner_github_info(self, path: GitHubPath, prefix: str) -> list[SelfHostedRunner]:
93-
"""Get runner information on GitHub under a repo or org.
100+
def get_runner(self, path: GitHubPath, prefix: str, runner_id: int) -> SelfHostedRunner:
101+
"""Get a specific self-hosted runner information under a repo or org.
102+
103+
Args:
104+
path: GitHub repository path in the format '<owner>/<repo>', or the GitHub organization
105+
name.
106+
prefix: Build the InstanceID with this prefix.
107+
runner_id: Runner id to get the self hosted runner.
108+
109+
Raises:
110+
GithubRunnerNotFoundError: If the runner is not found.
111+
112+
Returns:
113+
The information for the requested runner.
114+
"""
115+
try:
116+
if isinstance(path, GitHubRepo):
117+
raw_runner = self._client.actions.get_self_hosted_runner_for_repo(
118+
path.owner, path.repo, runner_id
119+
)
120+
else:
121+
raw_runner = self._client.actions.get_self_hosted_runner_for_org(
122+
path.org, runner_id
123+
)
124+
except HTTP404NotFoundError as err:
125+
raise GithubRunnerNotFoundError from err
126+
instance_id = InstanceID.build_from_name(prefix, raw_runner["name"])
127+
return SelfHostedRunner.build_from_github(raw_runner, instance_id)
128+
129+
@catch_http_errors
130+
def list_runners(self, path: GitHubPath, prefix: str) -> list[SelfHostedRunner]:
131+
"""Get all runners information on GitHub under a repo or org.
94132
95133
Args:
96134
path: GitHub repository path in the format '<owner>/<repo>', or the GitHub organization

github-runner-manager/src/github_runner_manager/manager/runner_manager.py

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import copy
77
import logging
8+
import time
89
from dataclasses import dataclass
910
from enum import Enum, auto
1011
from multiprocessing import Pool
@@ -23,11 +24,19 @@
2324
from github_runner_manager.metrics import github as github_metrics
2425
from github_runner_manager.metrics import runner as runner_metrics
2526
from github_runner_manager.metrics.runner import RunnerMetrics
26-
from github_runner_manager.platform.platform_provider import PlatformProvider, PlatformRunnerState
27+
from github_runner_manager.platform.platform_provider import (
28+
PlatformProvider,
29+
PlatformRunnerState,
30+
)
2731
from github_runner_manager.types_.github import SelfHostedRunner
2832

2933
logger = logging.getLogger(__name__)
3034

35+
# After a runner is created, there will be as many health checks as
36+
# elements in this variable. The elements in the tuple represent
37+
# the time waiting before each health check against the platform provider.
38+
RUNNER_CREATION_WAITING_TIMES = (60, 60, 120, 240, 480)
39+
3140
IssuedMetricEventsStats = dict[Type[metric_events.Event], int]
3241

3342

@@ -474,16 +483,59 @@ def _create_runner(args: _CreateRunnerArgs) -> InstanceID:
474483
args.metadata.runner_id = str(github_runner.id)
475484

476485
try:
477-
args.cloud_runner_manager.create_runner(
486+
cloud_instance = args.cloud_runner_manager.create_runner(
478487
instance_id=instance_id,
479488
metadata=args.metadata,
480489
runner_context=runner_context,
481490
)
491+
492+
# This wait should be deleted to make the runner creation as
493+
# quick as possible. The waiting should only be done in the
494+
# reactive case, before checking that a job was taken.
495+
RunnerManager.wait_for_runner_online(
496+
platform_provider=args.platform_provider,
497+
instance_id=cloud_instance.instance_id,
498+
metadata=cloud_instance.metadata,
499+
)
500+
482501
except RunnerError:
483-
# try to clean the runner in GitHub. This is necessary, as for reactive runners
484-
# we do not know in the clean up if the runner is offline because if failed or
485-
# because it is being created.
486-
logger.warning("Deleting runner %s from platform", instance_id)
502+
logger.warning("Deleting runner %s from platform after creation failed", instance_id)
487503
args.platform_provider.delete_runners([github_runner])
488504
raise
489505
return instance_id
506+
507+
@staticmethod
508+
def wait_for_runner_online(
509+
platform_provider: PlatformProvider,
510+
instance_id: InstanceID,
511+
metadata: RunnerMetadata,
512+
) -> None:
513+
"""Wait until the runner is online.
514+
515+
The constant RUNNER_CREATION_WAITING_TIMES defines the time before calling
516+
the platform provider to check if the runner is online. Besides online runner,
517+
deletable runner will also be equivalent to online, as no more waiting should
518+
be needed.
519+
520+
Args:
521+
platform_provider: Platform provider to use for health checks.
522+
instance_id: InstanceID for the runner to wait for.
523+
metadata: Metadata for the runner to wait for.
524+
525+
Raises:
526+
RunnerError: If the runner did not come online after the specified time.
527+
528+
"""
529+
for wait_time in RUNNER_CREATION_WAITING_TIMES:
530+
time.sleep(wait_time)
531+
try:
532+
runner_health = platform_provider.get_runner_health(
533+
metadata=metadata, instance_id=instance_id
534+
)
535+
except PlatformApiError as exc:
536+
logger.error("Error getting the runner health: %s", exc)
537+
continue
538+
if runner_health.online or runner_health.deletable:
539+
break
540+
else:
541+
raise RunnerError(f"Runner {instance_id} did not get online")

github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py

Lines changed: 0 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from typing import Iterable, Sequence
1111

1212
import fabric
13-
import invoke
1413
import jinja2
1514
import paramiko
1615
from fabric import Connection as SSHConnection
@@ -22,11 +21,9 @@
2221
OpenStackError,
2322
OpenstackHealthCheckError,
2423
RunnerCreateError,
25-
RunnerStartError,
2624
SSHError,
2725
)
2826
from github_runner_manager.manager.cloud_runner_manager import (
29-
CloudInitStatus,
3027
CloudRunnerInstance,
3128
CloudRunnerManager,
3229
CloudRunnerState,
@@ -158,11 +155,6 @@ def create_runner(
158155
except OpenStackError as err:
159156
raise RunnerCreateError(f"Failed to create {instance_id} openstack runner") from err
160157

161-
logger.info("Waiting for runner process to startup: %s", instance.instance_id)
162-
self._wait_runner_startup(instance)
163-
logger.info("Waiting for runner process to be running: %s", instance.instance_id)
164-
self._wait_runner_running(instance)
165-
166158
logger.info("Runner %s created successfully", instance.instance_id)
167159
return self._build_cloud_runner_instance(instance)
168160

@@ -508,80 +500,6 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non
508500
result.stderr,
509501
)
510502

511-
@retry(tries=10, delay=60, local_logger=logger)
512-
def _wait_runner_startup(self, instance: OpenstackInstance) -> None:
513-
"""Wait until runner is startup.
514-
515-
Args:
516-
instance: The runner instance.
517-
518-
Raises:
519-
RunnerStartError: The runner startup process was not found on the runner.
520-
"""
521-
try:
522-
ssh_conn = self._openstack_cloud.get_ssh_connection(instance)
523-
except SSHError as err:
524-
raise RunnerStartError(
525-
f"Failed to SSH to {instance.instance_id} during creation possible due to setup "
526-
"not completed"
527-
) from err
528-
529-
logger.debug("Running `cloud-init status` on instance %s.", instance.instance_id)
530-
result: invoke.runners.Result = ssh_conn.run("cloud-init status", warn=True, timeout=60)
531-
if not result.ok:
532-
logger.warning(
533-
"cloud-init status command failed on %s: %s.", instance.instance_id, result.stderr
534-
)
535-
raise RunnerStartError(f"Runner startup process not found on {instance.instance_id}")
536-
# A short running job may have already completed and exited the runner, hence check the
537-
# condition via cloud-init status check.
538-
if CloudInitStatus.DONE in result.stdout:
539-
return
540-
logger.debug("Running `ps aux` on instance %s.", instance.instance_id)
541-
result = ssh_conn.run("ps aux", warn=True, timeout=60, hide=True)
542-
if not result.ok:
543-
logger.warning("SSH run of `ps aux` failed on %s", instance.instance_id)
544-
raise RunnerStartError(f"Unable to SSH run `ps aux` on {instance.instance_id}")
545-
# Runner startup process is the parent process of runner.Listener and runner.Worker which
546-
# starts up much faster.
547-
if RUNNER_STARTUP_PROCESS not in result.stdout:
548-
logger.warning("Runner startup process not found on %s", instance.instance_id)
549-
raise RunnerStartError(f"Runner startup process not found on {instance.instance_id}")
550-
logger.info("Runner startup process found to be healthy on %s", instance.instance_id)
551-
552-
@retry(tries=5, delay=60, local_logger=logger)
553-
def _wait_runner_running(self, instance: OpenstackInstance) -> None:
554-
"""Wait until runner is running.
555-
556-
Args:
557-
instance: The runner instance.
558-
559-
Raises:
560-
RunnerStartError: The runner process was not found on the runner.
561-
"""
562-
try:
563-
ssh_conn = self._openstack_cloud.get_ssh_connection(instance)
564-
except SSHError as err:
565-
raise RunnerStartError(
566-
f"Failed to SSH connect to {instance.instance_id} openstack runner"
567-
) from err
568-
569-
try:
570-
healthy = health_checks.check_active_runner(
571-
ssh_conn=ssh_conn, instance=instance, accept_finished_job=True
572-
)
573-
except OpenstackHealthCheckError as exc:
574-
raise RunnerStartError(
575-
f"Failed to check health of runner process on {instance.instance_id}"
576-
) from exc
577-
if not healthy:
578-
logger.info("Runner %s not considered healthy", instance.instance_id)
579-
raise RunnerStartError(
580-
f"Runner {instance.instance_id} failed to initialize after starting"
581-
)
582-
583-
logger.info("Runner %s found to be healthy", instance.instance_id)
584-
585503
@staticmethod
586504
def _run_runner_removal_script(
587505
instance_id: InstanceID, ssh_conn: SSHConnection, remove_token: str

github-runner-manager/src/github_runner_manager/platform/github_provider.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,16 @@
1111

1212
from github_runner_manager.configuration.github import GitHubConfiguration, GitHubRepo
1313
from github_runner_manager.errors import JobNotFoundError as GithubJobNotFoundError
14-
from github_runner_manager.github_client import GithubClient
14+
from github_runner_manager.github_client import GithubClient, GithubRunnerNotFoundError
1515
from github_runner_manager.manager.models import InstanceID, RunnerContext, RunnerMetadata
1616
from github_runner_manager.platform.platform_provider import (
1717
JobInfo,
1818
JobNotFoundError,
1919
PlatformProvider,
20+
PlatformRunnerHealth,
2021
PlatformRunnerState,
2122
)
22-
from github_runner_manager.types_.github import SelfHostedRunner
23+
from github_runner_manager.types_.github import GitHubRunnerStatus, SelfHostedRunner
2324

2425
logger = logging.getLogger(__name__)
2526

@@ -58,6 +59,40 @@ def build(
5859
github_client=GithubClient(github_configuration.token),
5960
)
6061

62+
def get_runner_health(
63+
self,
64+
metadata: RunnerMetadata,
65+
instance_id: InstanceID,
66+
) -> PlatformRunnerHealth:
67+
"""Get information on the health of a github runner.
68+
69+
Args:
70+
metadata: Metadata for the runner.
71+
instance_id: Instance ID of the runner.
72+
73+
Returns:
74+
Information about the health status of the runner.
75+
"""
76+
try:
77+
runner = self._client.get_runner(self._path, self._prefix, int(metadata.runner_id))
78+
online = runner.status == GitHubRunnerStatus.ONLINE
79+
return PlatformRunnerHealth(
80+
instance_id=instance_id,
81+
metadata=metadata,
82+
online=online,
83+
busy=runner.busy,
84+
deletable=False,
85+
)
86+
87+
except GithubRunnerNotFoundError:
88+
return PlatformRunnerHealth(
89+
instance_id=instance_id,
90+
metadata=metadata,
91+
online=False,
92+
busy=False,
93+
deletable=True,
94+
)
95+
6196
def get_runners(
6297
self, states: Iterable[PlatformRunnerState] | None = None
6398
) -> tuple[SelfHostedRunner, ...]:
@@ -69,7 +104,7 @@ def get_runners(
69104
Returns:
70105
Information on the runners.
71106
"""
72-
runner_list = self._client.get_runner_github_info(self._path, self._prefix)
107+
runner_list = self._client.list_runners(self._path, self._prefix)
73108

74109
if states is None:
75110
return tuple(runner_list)

0 commit comments

Comments
 (0)