Skip to content

Commit 954f18a

Browse files
authored
[ISD-3696] Feat: No longer wait for runner to come online during runner creation (#570)
1 parent 7c05feb commit 954f18a

18 files changed

+95
-207
lines changed

github-runner-manager/src/github_runner_manager/manager/runner_manager.py

Lines changed: 0 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
import copy
77
import logging
8-
import time
98
from dataclasses import dataclass
109
from enum import Enum, auto
1110
from functools import partial
@@ -522,55 +521,12 @@ def _create_runner(args: _CreateRunnerArgs) -> InstanceID:
522521
runner_identity=runner_identity,
523522
runner_context=runner_context,
524523
)
525-
526-
# This wait should be deleted to make the runner creation as
527-
# quick as possible. The waiting should only be done in the
528-
# reactive case, before checking that a job was taken.
529-
RunnerManager.wait_for_runner_online(
530-
platform_provider=args.platform_provider,
531-
runner_identity=runner_identity,
532-
)
533-
534524
except RunnerError:
535525
logger.warning("Deleting runner %s from platform after creation failed", instance_id)
536526
args.platform_provider.delete_runner(runner_info.identity)
537527
raise
538528
return instance_id
539529

540-
@staticmethod
541-
def wait_for_runner_online(
542-
platform_provider: PlatformProvider,
543-
runner_identity: RunnerIdentity,
544-
) -> None:
545-
"""Wait until the runner is online.
546-
547-
The constant RUNNER_CREATION_WAITING_TIMES defines the time before calling
548-
the platform provider to check if the runner is online. Besides online runner,
549-
deletable runner will also be equivalent to online, as no more waiting should
550-
be needed.
551-
552-
Args:
553-
platform_provider: Platform provider to use for health checks.
554-
runner_identity: Identity of the runner.
555-
556-
Raises:
557-
RunnerError: If the runner did not come online after the specified time.
558-
559-
"""
560-
for wait_time in RUNNER_CREATION_WAITING_TIMES:
561-
time.sleep(wait_time)
562-
try:
563-
runner_health = platform_provider.get_runner_health(runner_identity)
564-
except PlatformApiError:
565-
logger.exception("Error getting the runner health: %s", runner_identity)
566-
continue
567-
if runner_health.online or runner_health.deletable:
568-
logger.info("Runner %s online", runner_identity)
569-
break
570-
logger.info("Runner %s not yet online", runner_identity)
571-
else:
572-
raise RunnerError(f"Runner {runner_identity} did not get online")
573-
574530

575531
def _filter_runner_to_delete(
576532
cloud_runner: CloudRunnerInstance,

github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_cloud.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ def launch_instance(
253253
userdata=cloud_init,
254254
auto_ip=False,
255255
timeout=CREATE_SERVER_TIMEOUT,
256-
wait=True,
256+
wait=False,
257257
meta=meta,
258258
)
259259
except openstack.exceptions.ResourceTimeout as err:

github-runner-manager/src/github_runner_manager/reactive/consumer.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -240,11 +240,9 @@ def _spawn_runner(
240240
return
241241
logger.info("Reactive runner spawned %s", instance_ids)
242242

243-
for iteration in range(5):
244-
# Do not sleep on the first iteration — the job might already be taken.
243+
for _ in range(5):
244+
sleep(60)
245245
logger.info("Checking if job picked up for reactive runner %s", instance_ids)
246-
if iteration != 0:
247-
sleep(60)
248246
if platform_provider.check_job_been_picked_up(metadata=metadata, job_url=job_url):
249247
logger.info("Job picked %s. reactive runner ok %s", job_url, instance_ids)
250248
msg.ack()

github-runner-manager/tests/unit/manager/test_runner_manager.py

Lines changed: 4 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,11 @@
33

44
"""Unit tests for the the runner_manager."""
55

6-
from unittest.mock import ANY, MagicMock
6+
from unittest.mock import MagicMock
77

88
import pytest
99

1010
from github_runner_manager.errors import RunnerCreateError
11-
from github_runner_manager.manager import runner_manager as runner_manager_module
1211
from github_runner_manager.manager.cloud_runner_manager import (
1312
CloudRunnerInstance,
1413
CloudRunnerManager,
@@ -22,7 +21,6 @@
2221
from github_runner_manager.manager.runner_manager import RunnerManager
2322
from github_runner_manager.platform.platform_provider import (
2423
PlatformProvider,
25-
PlatformRunnerHealth,
2624
RunnersHealthResponse,
2725
)
2826
from github_runner_manager.types_.github import GitHubRunnerStatus, SelfHostedRunner
@@ -104,55 +102,12 @@ def _get_runner_context(instance_id, metadata, labels):
104102
github_provider.delete_runner.assert_called_once_with(github_runner.identity)
105103

106104

107-
@pytest.mark.parametrize(
108-
"creation_waiting_times,runner_unhealthy,runner_healthy",
109-
[
110-
pytest.param(
111-
(0,),
112-
None,
113-
PlatformRunnerHealth(
114-
identity=MagicMock(),
115-
online=True,
116-
busy=False,
117-
deletable=False,
118-
),
119-
id="online runner",
120-
),
121-
pytest.param(
122-
(0, 0),
123-
PlatformRunnerHealth(
124-
identity=MagicMock(),
125-
online=False,
126-
busy=True,
127-
deletable=False,
128-
),
129-
PlatformRunnerHealth(
130-
identity=MagicMock(),
131-
online=False,
132-
busy=False,
133-
deletable=True,
134-
),
135-
id="deletable runner",
136-
),
137-
],
138-
)
139-
def test_create_runner(
140-
monkeypatch: pytest.MonkeyPatch,
141-
creation_waiting_times: tuple[int, ...],
142-
runner_unhealthy: PlatformRunnerHealth | None,
143-
runner_healthy: PlatformRunnerHealth,
144-
):
105+
def test_create_runner() -> None:
145106
"""
146-
arrange: Given a specific pattern for creation waiting times and a list of.
147-
PlatformRunnerHealth objects being the last one a healthy runner.
107+
arrange: None.
148108
act: call runner_manager.create_runners.
149-
assert: The runner manager will create the runner and make requests to check the health
150-
until it gets a healthy state.
109+
assert: The runner manager will create the runner.
151110
"""
152-
monkeypatch.setattr(
153-
runner_manager_module, "RUNNER_CREATION_WAITING_TIMES", creation_waiting_times
154-
)
155-
156111
cloud_runner_manager = MagicMock(spec=CloudRunnerManager)
157112
cloud_runner_manager.name_prefix = "unit-0"
158113

@@ -161,10 +116,6 @@ def test_create_runner(
161116
github_runner = MagicMock()
162117
platform_provider.get_runner_context.return_value = (runner_context_mock, github_runner)
163118

164-
platform_provider.get_runner_health.side_effect = tuple(
165-
runner_unhealthy for _ in range(len(creation_waiting_times) - 1)
166-
) + (runner_healthy,)
167-
168119
runner_manager = RunnerManager(
169120
"managername",
170121
platform_provider=platform_provider,
@@ -176,54 +127,3 @@ def test_create_runner(
176127

177128
assert instance_id
178129
cloud_runner_manager.create_runner.assert_called_once()
179-
# The method to get the runner health was called three times
180-
# until the runner was online.
181-
assert platform_provider.get_runner_health.call_count == len(creation_waiting_times)
182-
platform_provider.get_runner_health.assert_called()
183-
184-
185-
def test_create_runner_failed_waiting(monkeypatch: pytest.MonkeyPatch):
186-
"""
187-
arrange: Given a specific pattern for creation waiting times and a list of.
188-
PlatformRunnerHealth objects where none is healthy
189-
act: call runner_manager.create_runners.
190-
assert: The runner manager will create the runner, it will check for the health state,
191-
but the runner will not get into healthy state and the platform api for deleting
192-
the runner will be called.
193-
"""
194-
runner_creation_waiting_times = (0, 0)
195-
monkeypatch.setattr(
196-
runner_manager_module, "RUNNER_CREATION_WAITING_TIMES", runner_creation_waiting_times
197-
)
198-
199-
cloud_runner_manager = MagicMock(spec=CloudRunnerManager)
200-
cloud_runner_manager.name_prefix = "unit-0"
201-
202-
platform_provider = MagicMock(spec=PlatformProvider)
203-
runner_context_mock = MagicMock()
204-
github_runner = MagicMock()
205-
platform_provider.get_runner_context.return_value = (runner_context_mock, github_runner)
206-
207-
health_offline = PlatformRunnerHealth(
208-
identity=MagicMock(), online=False, busy=False, deletable=False
209-
)
210-
211-
platform_provider.get_runner_health.side_effect = (
212-
health_offline,
213-
health_offline,
214-
)
215-
216-
runner_manager = RunnerManager(
217-
"managername",
218-
platform_provider=platform_provider,
219-
cloud_runner_manager=cloud_runner_manager,
220-
labels=["label1", "label2"],
221-
)
222-
223-
() = runner_manager.create_runners(1, RunnerMetadata(), True)
224-
225-
# The runner was started even if it failed.
226-
cloud_runner_manager.create_runner.assert_called_once()
227-
assert platform_provider.get_runner_health.call_count == 2
228-
platform_provider.get_runner_health.assert_called()
229-
platform_provider.delete_runner.assert_called_once_with(ANY)

github-runner-manager/tests/unit/test_runner_scaler.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,6 @@ def runner_manager_fixture(
121121
cloud_runner_manager=mock_cloud,
122122
labels=["label1", "label2", "arm64", "noble", "flavorlabel"],
123123
)
124-
# We do not want to wait in the unit tests for machines to be ready.
125-
monkeypatch.setattr(runner_manager_module, "RUNNER_CREATION_WAITING_TIMES", (0,))
126124
return runner_manager
127125

128126

tests/integration/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
MONGODB_APP_NAME,
4242
deploy_github_runner_charm,
4343
wait_for,
44-
wait_for_reconcile,
44+
wait_for_runner_ready,
4545
)
4646
from tests.integration.helpers.openstack import OpenStackInstanceHelper, PrivateEndpointConfigs
4747
from tests.status_name import ACTIVE
@@ -485,7 +485,7 @@ async def app_scheduled_events_fixture(
485485
await application.set_config({"reconcile-interval": "8"})
486486
await application.set_config({BASE_VIRTUAL_MACHINES_CONFIG_NAME: "1"})
487487
await model.wait_for_idle(apps=[application.name], status=ACTIVE, timeout=20 * 60)
488-
await wait_for_reconcile(app=application, model=model)
488+
await wait_for_runner_ready(app=application)
489489
return application
490490

491491

tests/integration/helpers/common.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,17 +101,16 @@ async def get_reconcile_id(unit: Unit) -> str:
101101
return stdout
102102

103103

104-
async def wait_for_reconcile(app: Application, model: Model) -> None:
104+
async def wait_for_reconcile(app: Application) -> None:
105105
"""Wait until a reconcile has happened.
106106
107107
Uses the first unit found in the application.
108108
109109
Args:
110110
app: The GitHub Runner Charm application.
111-
model: The machine charm model.
112111
"""
113112
# Wait the application is actively reconciling. Avoid waiting for image, etc.
114-
await model.wait_for_idle(apps=[app.name], status=ACTIVE)
113+
await app.model.wait_for_idle(apps=[app.name], status=ACTIVE)
115114

116115
unit = app.units[0]
117116
base_id = await get_reconcile_id(unit)
@@ -122,6 +121,29 @@ async def wait_for_reconcile(app: Application, model: Model) -> None:
122121
return
123122

124123

124+
async def wait_for_runner_ready(app: Application) -> None:
125+
"""Wait until a runner is ready.
126+
127+
Uses the first unit found in the application.
128+
129+
Args:
130+
app: The GitHub Runner Charm application.
131+
"""
132+
await wait_for_reconcile(app)
133+
134+
# Wait for 10 minutes for the runner to come online.
135+
for _ in range(20):
136+
action = await app.units[0].run_action("check-runners")
137+
await action.wait()
138+
139+
if action.status == "completed" and int(action.results["online"]) >= 1:
140+
break
141+
142+
await sleep(30)
143+
else:
144+
assert False, "Timeout waiting for runner to be ready"
145+
146+
125147
async def deploy_github_runner_charm(
126148
model: Model,
127149
charm_file: str,

tests/integration/helpers/openstack.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,11 @@
1212
from openstack.compute.v2.server import Server
1313

1414
from charm_state import BASE_VIRTUAL_MACHINES_CONFIG_NAME
15-
from tests.integration.helpers.common import run_in_unit, wait_for, wait_for_reconcile
15+
from tests.integration.helpers.common import (
16+
run_in_unit,
17+
wait_for,
18+
wait_for_runner_ready,
19+
)
1620

1721
logger = logging.getLogger(__name__)
1822

@@ -152,7 +156,7 @@ async def _set_app_runner_amount(app: Application, num_runners: int) -> None:
152156
num_runners: The number of runners.
153157
"""
154158
await app.set_config({BASE_VIRTUAL_MACHINES_CONFIG_NAME: f"{num_runners}"})
155-
await wait_for_reconcile(app=app, model=app.model)
159+
await wait_for_runner_ready(app=app)
156160

157161
async def get_runner_names(self, unit: Unit) -> list[str]:
158162
"""Get the name of all the runners in the unit.

tests/integration/jobmanager/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ async def app_fixture(
9898
PATH_CONFIG_NAME: jobmanager_base_url,
9999
}
100100
)
101-
await wait_for_reconcile(app_for_jobmanager, app_for_jobmanager.model)
101+
await wait_for_reconcile(app_for_jobmanager)
102102

103103
httpserver.clear_all_handlers()
104104

@@ -111,4 +111,4 @@ async def app_fixture(
111111
RECONCILE_INTERVAL_CONFIG_NAME: str(DEFAULT_RECONCILE_INTERVAL),
112112
}
113113
)
114-
await wait_for_reconcile(app_for_jobmanager, app_for_jobmanager.model)
114+
await wait_for_reconcile(app_for_jobmanager)

tests/integration/jobmanager/test_jobmanager_prespawned.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
RECONCILE_INTERVAL_CONFIG_NAME,
1616
)
1717
from tests.integration.conftest import DEFAULT_RECONCILE_INTERVAL
18-
from tests.integration.helpers.common import wait_for, wait_for_reconcile
18+
from tests.integration.helpers.common import wait_for, wait_for_reconcile, wait_for_runner_ready
1919
from tests.integration.helpers.openstack import OpenStackInstanceHelper
2020
from tests.integration.jobmanager.helpers import (
2121
GetRunnerHealthEndpoint,
@@ -147,7 +147,7 @@ async def _prepare_runner() -> bool:
147147
# TMP: hack to trigger reconcile by changing the configuration, which cause config_changed hook
148148
# to restart the reconcile service.
149149
await app.set_config({RECONCILE_INTERVAL_CONFIG_NAME: str(DEFAULT_RECONCILE_INTERVAL + 1)})
150-
await wait_for_reconcile(app, app.model)
150+
await wait_for_runner_ready(app)
151151

152152
# At this point there should be a runner
153153
await _assert_runners(app, online=1, busy=1, offline=0, unknown=0)
@@ -159,6 +159,6 @@ async def _prepare_runner() -> bool:
159159
# TMP: hack to trigger reconcile by changing the configuration, which cause config_changed hook
160160
# to restart the reconcile service.
161161
await app.set_config({RECONCILE_INTERVAL_CONFIG_NAME: str(DEFAULT_RECONCILE_INTERVAL + 2)})
162-
await wait_for_reconcile(app, app.model)
162+
await wait_for_reconcile(app)
163163

164164
await _assert_runners(app, online=0, busy=0, offline=0, unknown=0)

0 commit comments

Comments
 (0)