Skip to content

Commit a212247

Browse files
authored
feat: pre-spawned mode for jobmanager platform (#567)
* fix reconciliation timeout
* update CODEOWNERS
* update openapi client
* use path for jobmanager_url
* update requirements.txt
* update jobmanager
* update jobmanager_provider
* update multiplexer platform
* update jobmanager
* update jobmanager_provider
* update integration test
* add clouds.yaml to juju ignore
* WIP checkin
* specify platform in runner scaler
* WIP adaption for pre-spawned test in jobmanager
* clean pre-spawned test
* set oneshot request handler before normal handler
* add unit on top
* add mongodb endpoints
* add jubilant
* fix get_mongodb_uri for case where multiple secrets are there
* add test for reactive mode
* reuse model from ops_test for integration test
* fix jubilant scope
* don't mention JobManager yet
* remove todo
* wait that all agents are idle
* Refactor integration test
* remove unnecessary integration test
* handle mongodb relation broken
* fix wf file
* lint
* remove unused constants in charm.py
* split integration tests
* add back ending new line
* remove empty line comment
* move Platform enum to multiplexer
* add job_id validation
* add more validation
* fix job_path_prefix in validation
1 parent 103c10d commit a212247

File tree

90 files changed

+7024
-1916
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

90 files changed

+7024
-1916
lines changed

.github/workflows/integration_test.yaml

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,6 @@ concurrency:
1212
cancel-in-progress: true
1313

1414
jobs:
15-
# test option values defined at test/conftest.py are passed on via repository secret
16-
# INTEGRATION_TEST_ARGS to operator-workflows automatically.
17-
integration-tests:
18-
name: Integration test with juju 3.1
19-
uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
20-
secrets: inherit
21-
with:
22-
juju-channel: 3.1/stable
23-
pre-run-script: scripts/setup-integration-tests.sh
24-
provider: lxd
25-
test-tox-env: integration-juju3.1
26-
modules: '["test_charm_scheduled_events", "test_debug_ssh", "test_charm_upgrade"]'
27-
extra-arguments: '-m openstack --log-format="%(asctime)s %(levelname)s %(message)s"'
28-
self-hosted-runner: true
29-
self-hosted-runner-label: stg-private-endpoint
30-
test-timeout: 90
3115
openstack-interface-tests-private-endpoint:
3216
name: openstack interface test using private-endpoint
3317
uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
@@ -50,14 +34,13 @@ jobs:
5034
pre-run-script: scripts/setup-integration-tests.sh
5135
provider: lxd
5236
test-tox-env: integration-juju3.6
53-
modules: '["test_charm_metrics_failure", "test_charm_metrics_success", "test_charm_fork_repo", "test_charm_fork_path_change", "test_charm_no_runner", "test_charm_runner", "test_reactive", "test_jobmanager"]'
37+
modules: '["test_charm_metrics_failure", "test_charm_metrics_success", "test_charm_fork_repo", "test_charm_fork_path_change", "test_charm_no_runner", "test_charm_runner", "test_debug_ssh", "test_charm_upgrade", "test_reactive", "test_jobmanager_prespawned", "test_jobmanager_reactive"]'
5438
extra-arguments: '-m openstack --log-format="%(asctime)s %(levelname)s %(message)s"'
5539
self-hosted-runner: true
5640
self-hosted-runner-label: stg-private-endpoint
5741
allure-report:
5842
if: ${{ (success() || failure()) && github.event_name == 'schedule' }}
5943
needs:
60-
- integration-tests
6144
- openstack-interface-tests-private-endpoint
6245
- openstack-integration-tests-private-endpoint
6346
uses: canonical/operator-workflows/.github/workflows/allure_report.yaml@main

.jujuignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ tests
66
src-docs
77
docs
88
github-runner-operator
9+
clouds.yaml

docs/changelog.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@ network/infrastructure specific things.
4141

4242
### 2025-05-06
4343

44-
- The ssh health checks are removed and the platform providers (GitHub or the JobManager) are used instead to get the runners health
44+
- The SSH health checks are removed and GitHub is used instead to get the runners' health
4545
information. This implies many changes in both the structure of the project and its functionality. Potentially, many race conditions should
46-
disappear for the GitHub case.
46+
disappear.
4747

4848
### 2025-04-28
4949

github-runner-manager/src/github_runner_manager/configuration/base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import yaml
1111
from pydantic import AnyHttpUrl, BaseModel, Field, IPvAnyAddress, MongoDsn, root_validator
1212

13-
from github_runner_manager.configuration import github
13+
from github_runner_manager.configuration import github, jobmanager
1414
from github_runner_manager.openstack_cloud.configuration import OpenStackConfiguration
1515

1616
logger = logging.getLogger(__name__)
@@ -40,6 +40,7 @@ class ApplicationConfiguration(BaseModel):
4040
Attributes:
4141
name: Name to identify the manager. Used for metrics.
4242
extra_labels: Extra labels to add to the runner.
43+
jobmanager_config: Configuration for the jobmanager platform.
4344
github_config: GitHub configuration.
4445
service_config: The configuration for supporting services.
4546
non_reactive_configuration: Configuration for non-reactive mode.
@@ -50,6 +51,7 @@ class ApplicationConfiguration(BaseModel):
5051

5152
name: str
5253
extra_labels: list[str]
54+
jobmanager_config: jobmanager.JobManagerConfiguration | None
5355
github_config: github.GitHubConfiguration | None
5456
service_config: "SupportServiceConfig"
5557
non_reactive_configuration: "NonReactiveConfiguration"
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Copyright 2025 Canonical Ltd.
2+
# See LICENSE file for licensing details.
3+
4+
"""Module containing JobManager Configuration."""
5+
from pydantic import BaseModel, HttpUrl
6+
7+
8+
class JobManagerConfiguration(BaseModel):
9+
"""JobManager configuration for the application.
10+
11+
Attributes:
12+
url: Base url of the job manager API.
13+
"""
14+
15+
url: HttpUrl

github-runner-manager/src/github_runner_manager/manager/runner_manager.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -508,13 +508,13 @@ def _create_runner(args: _CreateRunnerArgs) -> InstanceID:
508508
RunnerError: On error creating OpenStack runner.
509509
"""
510510
instance_id = InstanceID.build(args.cloud_runner_manager.name_prefix, args.reactive)
511-
runner_context, github_runner = args.platform_provider.get_runner_context(
511+
runner_context, runner_info = args.platform_provider.get_runner_context(
512512
instance_id=instance_id, metadata=args.metadata, labels=args.labels
513513
)
514514

515515
# Update the runner id if necessary
516516
if not args.metadata.runner_id:
517-
args.metadata.runner_id = str(github_runner.id)
517+
args.metadata.runner_id = str(runner_info.id)
518518

519519
runner_identity = RunnerIdentity(instance_id=instance_id, metadata=args.metadata)
520520
try:
@@ -533,7 +533,7 @@ def _create_runner(args: _CreateRunnerArgs) -> InstanceID:
533533

534534
except RunnerError:
535535
logger.warning("Deleting runner %s from platform after creation failed", instance_id)
536-
args.platform_provider.delete_runner(github_runner.identity)
536+
args.platform_provider.delete_runner(runner_info.identity)
537537
raise
538538
return instance_id
539539

github-runner-manager/src/github_runner_manager/manager/runner_scaler.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
OpenStackRunnerManager,
3434
OpenStackRunnerManagerConfig,
3535
)
36-
from github_runner_manager.platform.multiplexer_provider import MultiplexerPlatform
36+
from github_runner_manager.platform.multiplexer_provider import MultiplexerPlatform, Platform
3737
from github_runner_manager.platform.platform_provider import PlatformRunnerState
3838
from github_runner_manager.reactive.types_ import ReactiveProcessConfig
3939

@@ -141,6 +141,7 @@ def build( # pylint: disable-msg=too-many-locals
141141
platform_provider = MultiplexerPlatform.build(
142142
prefix=application_configuration.openstack_configuration.vm_prefix,
143143
github_configuration=application_configuration.github_config,
144+
jobmanager_configuration=application_configuration.jobmanager_config,
144145
)
145146

146147
runner_manager = RunnerManager(
@@ -163,6 +164,7 @@ def build( # pylint: disable-msg=too-many-locals
163164
queue=reactive_config.queue,
164165
manager_name=application_configuration.name,
165166
github_configuration=application_configuration.github_config,
167+
jobmanager_configuration=application_configuration.jobmanager_config,
166168
cloud_runner_manager=openstack_runner_manager_config,
167169
supported_labels=supported_labels,
168170
labels=labels,
@@ -174,6 +176,9 @@ def build( # pylint: disable-msg=too-many-locals
174176
user=user,
175177
base_quantity=base_quantity,
176178
max_quantity=max_quantity,
179+
platform_name=(
180+
Platform.GITHUB if application_configuration.github_config else Platform.JOBMANAGER
181+
),
177182
python_path=python_path,
178183
)
179184

@@ -190,6 +195,7 @@ def __init__( # pylint: disable=too-many-arguments, too-many-positional-argumen
190195
base_quantity: int,
191196
max_quantity: int,
192197
python_path: str | None = None,
198+
platform_name: Platform = Platform.GITHUB,
193199
):
194200
"""Construct the object.
195201
@@ -199,13 +205,15 @@ def __init__( # pylint: disable=too-many-arguments, too-many-positional-argumen
199205
user: The user to run the reactive process.
200206
base_quantity: The number of intended non-reactive runners.
201207
max_quantity: The number of maximum runners for reactive.
208+
platform_name: The name of the platform used for spawning runners.
202209
python_path: The PYTHONPATH to access the github-runner-manager library.
203210
"""
204211
self._manager = runner_manager
205212
self._reactive_config = reactive_process_config
206213
self._user = user
207214
self._base_quantity = base_quantity
208215
self._max_quantity = max_quantity
216+
self._platform_name = platform_name
209217
self._python_path = python_path
210218

211219
def get_runner_info(self) -> RunnerInfo:
@@ -335,7 +343,9 @@ def _reconcile_non_reactive(self, expected_quantity: int) -> _ReconcileResult:
335343
runner_diff = expected_quantity - len(runners)
336344
if runner_diff > 0:
337345
try:
338-
self._manager.create_runners(num=runner_diff, metadata=RunnerMetadata())
346+
self._manager.create_runners(
347+
num=runner_diff, metadata=RunnerMetadata(platform_name=self._platform_name)
348+
)
339349
except MissingServerConfigError:
340350
logging.exception(
341351
"Unable to spawn runner due to missing server configuration, "

github-runner-manager/src/github_runner_manager/platform/jobmanager_provider.py

Lines changed: 72 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77
from enum import Enum
88

99
import jobmanager_client
10-
from jobmanager_client.models.v1_jobs_job_id_token_post_request import V1JobsJobIdTokenPostRequest
1110
from jobmanager_client.rest import ApiException, NotFoundException
1211
from pydantic import HttpUrl
1312
from pydantic.error_wrappers import ValidationError
1413
from urllib3.exceptions import RequestError
1514

15+
from github_runner_manager.configuration.jobmanager import JobManagerConfiguration
1616
from github_runner_manager.manager.models import (
1717
InstanceID,
1818
RunnerContext,
@@ -38,14 +38,25 @@
3838
class JobManagerPlatform(PlatformProvider):
3939
"""Manage self-hosted runner on the JobManager."""
4040

41+
def __init__(self, url: str):
42+
"""Construct the object.
43+
44+
Args:
45+
url: The jobmanager base URL.
46+
"""
47+
self._url = url
48+
4149
@classmethod
42-
def build(cls) -> "JobManagerPlatform":
50+
def build(cls, jobmanager_configuration: JobManagerConfiguration) -> "JobManagerPlatform":
4351
"""Build a new instance of the JobManagerPlatform.
4452
53+
Args:
54+
jobmanager_configuration: Configuration for the jobmanager.
55+
4556
Returns:
4657
New JobManagerPlatform.
4758
"""
48-
return cls()
59+
return cls(url=jobmanager_configuration.url)
4960

5061
def get_runner_health(
5162
self,
@@ -62,11 +73,11 @@ def get_runner_health(
6273
Returns:
6374
The health of the runner in the jobmanager.
6475
"""
65-
configuration = jobmanager_client.Configuration(host=runner_identity.metadata.url)
76+
configuration = jobmanager_client.Configuration(host=self._url)
6677
with jobmanager_client.ApiClient(configuration) as api_client:
67-
api_instance = jobmanager_client.DefaultApi(api_client)
78+
api_instance = jobmanager_client.RunnersApi(api_client)
6879
try:
69-
response = api_instance.v1_jobs_job_id_health_get(
80+
response = api_instance.get_runner_health_v1_runner_runner_id_health_get(
7081
int(runner_identity.metadata.runner_id)
7182
)
7283
except NotFoundException:
@@ -132,14 +143,14 @@ def get_runners_health(self, requested_runners: list[RunnerIdentity]) -> Runners
132143
)
133144

134145
def delete_runner(self, runner_identity: RunnerIdentity) -> None:
135-
"""Delete a runner from jobmanager..
146+
"""Delete a runner from jobmanager.
136147
137148
This method does nothing, as the jobmanager does not implement it.
138149
139150
Args:
140151
runner_identity: The identity of the runner to delete.
141152
"""
142-
logger.debug("No need to delete jobs in the jobmanager.")
153+
logger.debug("No need to delete runners in the jobmanager.")
143154

144155
def get_runner_context(
145156
self, metadata: RunnerMetadata, instance_id: InstanceID, labels: list[str]
@@ -159,17 +170,29 @@ def get_runner_context(
159170
Returns:
160171
New runner token.
161172
"""
162-
configuration = jobmanager_client.Configuration(host=metadata.url)
173+
configuration = jobmanager_client.Configuration(host=self._url)
163174
with jobmanager_client.ApiClient(configuration) as api_client:
164-
api_instance = jobmanager_client.DefaultApi(api_client)
175+
api_instance = jobmanager_client.RunnersApi(api_client)
165176
try:
166-
# Retrieve jobs
167-
jobrequest = V1JobsJobIdTokenPostRequest(job_id=int(metadata.runner_id))
168-
response = api_instance.v1_jobs_job_id_token_post(
169-
int(metadata.runner_id), jobrequest
177+
runner_register_request = (
178+
jobmanager_client.RegisterRunnerV1RunnerRegisterPostRequest(
179+
name=instance_id.name, labels=labels
180+
)
181+
)
182+
183+
response = api_instance.register_runner_v1_runner_register_post(
184+
runner_register_request
170185
)
186+
if not response.id:
187+
raise PlatformApiError("No runner ID from jobmanager API")
188+
updated_metadata = RunnerMetadata(
189+
platform_name=metadata.platform_name, url=self._url
190+
)
191+
updated_metadata.runner_id = str(response.id)
171192
if token := response.token:
172-
jobmanager_endpoint = f"{metadata.url}/v1/jobs/{metadata.runner_id}/health"
193+
jobmanager_endpoint = (
194+
f"{self._url}/v1/runner/{updated_metadata.runner_id}/health"
195+
)
173196
# For now, use the first label
174197
label = "undefined"
175198
if labels:
@@ -190,7 +213,7 @@ def get_runner_context(
190213
metadata=metadata,
191214
),
192215
busy=False,
193-
id=int(metadata.runner_id),
216+
id=int(updated_metadata.runner_id),
194217
labels=[SelfHostedRunnerLabel(name=label) for label in labels],
195218
status=GitHubRunnerStatus.OFFLINE,
196219
),
@@ -209,17 +232,46 @@ def check_job_been_picked_up(self, metadata: RunnerMetadata, job_url: HttpUrl) -
209232
210233
Raises:
211234
PlatformApiError: Problem with the underlying client.
235+
ValueError: Raised when the job_url is malformed.
212236
213237
Returns:
214238
True if the job has been picked up, False otherwise.
215239
"""
216-
configuration = jobmanager_client.Configuration(host=metadata.url)
240+
configuration = jobmanager_client.Configuration(host=self._url)
241+
242+
# job_url has the path:
243+
# "/v1/jobs/<job_id>"
244+
job_path_prefix = "/v1/jobs/"
245+
246+
path = job_url.path
247+
if not (path and path.startswith(job_path_prefix)):
248+
logger.error(
249+
"Job URL path does not start with '%s'. Received %s", job_path_prefix, path
250+
)
251+
raise ValueError(f'Job URL path does not start with "{job_path_prefix}"')
252+
try:
253+
job_id = int(path[len(job_path_prefix) :]) # Extract job_id from the path
254+
except ValueError as exc:
255+
logger.error(
256+
"Job URL path %s does not contain a valid job_id after '%s'",
257+
path,
258+
job_path_prefix,
259+
)
260+
raise ValueError(
261+
f"Job URL path does not contain a valid job_id after '{job_path_prefix}'"
262+
) from exc
263+
logging.debug(
264+
"Parsed job_id: %s from job_url path %s",
265+
job_id,
266+
path,
267+
)
217268

218269
with jobmanager_client.ApiClient(configuration) as api_client:
219-
api_instance = jobmanager_client.DefaultApi(api_client)
270+
api_instance = jobmanager_client.JobsApi(api_client)
220271
try:
221-
job = api_instance.v1_jobs_job_id_get(int(metadata.runner_id))
222-
if job.status != JobStatus.PENDING:
272+
job = api_instance.get_job_v1_jobs_job_id_get(job_id)
273+
# the api returns a generic object, ignore the type for status
274+
if job.status != JobStatus.PENDING: # type: ignore
223275
return True
224276
except (ApiException, RequestError, ValidationError) as exc:
225277
logger.exception("Error calling jobmanager api to get job information.")

0 commit comments

Comments
 (0)