Skip to content

Commit d0947c6

Browse files
authored
[ISD-3570] Flush runner will kill reactive processes as well (#595)
1 parent 34cd855 commit d0947c6

File tree

10 files changed

+92
-8
lines changed

10 files changed

+92
-8
lines changed

docs/changelog.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22

33
This changelog documents user-relevant changes to the GitHub runner charm.
44

5-
## 2025-07-15
5+
## 2025-07-18
6+
7+
- Fix an issue where flushing runners does not include reactive processes. This cause some reactive runner to spawn with old code after upgrade.
8+
9+
## 2025-07-16
610

711
- Fix the incorrect default value of the aproxy-exclude-addresses configuration.
812

@@ -12,6 +16,7 @@ This changelog documents user-relevant changes to the GitHub runner charm.
1216
due to a bug on the OpenStack side: https://bugs.launchpad.net/nova/+bug/2095364
1317

1418
### 2025-06-30
19+
1520
- New configuration options aproxy-exclude-addresses and aproxy-redirect-ports for allowing aproxy to redirect arbitrary TCP traffic
1621
- Added prometheus metrics to the GitHub runner manager application.
1722

github-runner-manager/src/github_runner_manager/manager/runner_scaler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,8 @@ def flush(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> int:
267267
Number of runners flushed.
268268
"""
269269
metric_stats = self._manager.cleanup()
270+
if self._reactive_config is not None:
271+
reactive_runner_manager.flush_reactive_processes()
270272
delete_metric_stats = self._manager.flush_runners(flush_mode=flush_mode)
271273
events = set(delete_metric_stats.keys()) | set(metric_stats.keys())
272274
metric_stats = {

github-runner-manager/src/github_runner_manager/reactive/consumer.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727

2828
Labels = set[str]
2929

30+
PROCESS_COUNT_HEADER_NAME = "X-Process-Count"
31+
WAIT_TIME_IN_SEC = 60
3032
# This control message is for testing. The reactive process will stop consuming messages
3133
# when the message is sent. This message does not come from the router.
3234
END_PROCESSING_PAYLOAD = "__END__"
@@ -125,6 +127,14 @@ def consume( # noqa: C901
125127
if msg.payload == END_PROCESSING_PAYLOAD:
126128
msg.ack()
127129
break
130+
131+
msg.headers[PROCESS_COUNT_HEADER_NAME] = (
132+
msg.headers.get(PROCESS_COUNT_HEADER_NAME, 0) + 1
133+
)
134+
# Avoid rapid retrying to prevent overloading services, e.g., OpenStack API.
135+
if msg.headers[PROCESS_COUNT_HEADER_NAME] > 1:
136+
sleep(WAIT_TIME_IN_SEC)
137+
128138
job_details = _parse_job_details(msg)
129139
logger.info("Received reactive job: %s", job_details)
130140
if not _validate_labels(
@@ -248,7 +258,7 @@ def _spawn_runner(
248258
logger.info("Reactive runner spawned %s", instance_ids)
249259

250260
for _ in range(5):
251-
sleep(60)
261+
sleep(WAIT_TIME_IN_SEC)
252262
logger.info("Checking if job picked up for reactive runner %s", instance_ids)
253263
if platform_provider.check_job_been_picked_up(metadata=metadata, job_url=job_url):
254264
logger.info("Job picked %s. reactive runner ok %s", job_url, instance_ids)

github-runner-manager/src/github_runner_manager/reactive/process_manager.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,23 @@ def reconcile(
9090
return delta
9191

9292

93+
def kill_reactive_processes() -> None:
94+
"""Kill all reactive processes."""
95+
pids = _get_pids()
96+
if pids:
97+
for pid in pids:
98+
try:
99+
logger.info("Killing reactive runner process with pid %s", pid)
100+
os.kill(pid, signal.SIGTERM)
101+
except ProcessLookupError:
102+
logger.info(
103+
"Failed to kill process with pid %s. Process might have terminated it self.",
104+
pid,
105+
)
106+
else:
107+
logger.info("No reactive processes to flush")
108+
109+
93110
def _get_pids() -> list[int]:
94111
"""Get the PIDs of the reactive runners processes.
95112

github-runner-manager/src/github_runner_manager/reactive/runner_manager.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,8 @@ def reconcile(
128128
)
129129

130130
return ReconcileResult(processes_diff=processes_created, metric_stats=metric_stats)
131+
132+
133+
def flush_reactive_processes() -> None:
134+
"""Flush all the reactive processes."""
135+
process_manager.kill_reactive_processes()

github-runner-manager/tests/unit/reactive/test_process_manager.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
PYTHON_BIN,
1616
REACTIVE_RUNNER_SCRIPT_MODULE,
1717
ReactiveRunnerError,
18+
kill_reactive_processes,
1819
reconcile,
1920
)
2021
from github_runner_manager.reactive.types_ import QueueConfig, ReactiveProcessConfig
@@ -196,3 +197,17 @@ def _arrange_reactive_processes(secure_run_subprocess_mock: MagicMock, count: in
196197
stdout=f"CMD\n{process_cmds_before}".encode("utf-8"),
197198
stderr=b"",
198199
)
200+
201+
202+
def test_reactive_flush(
203+
os_kill_mock: MagicMock,
204+
secure_run_subprocess_mock: MagicMock,
205+
):
206+
"""
207+
arrange: Mock 3 reactive processes.
208+
act: Run flush for reactive.
209+
assert: Find 3 os.kill calls.
210+
"""
211+
_arrange_reactive_processes(secure_run_subprocess_mock, count=3)
212+
kill_reactive_processes()
213+
assert os_kill_mock.call_count == 3

github-runner-manager/tests/unit/test_runner_scaler.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,17 @@ def application_configuration_fixture() -> ApplicationConfiguration:
197197
)
198198

199199

200+
@pytest.fixture(scope="function", name="runner_scaler_reactive")
201+
def runner_scaler_reactive_fixture(
202+
application_configuration: ApplicationConfiguration,
203+
runner_manager: RunnerManager,
204+
user_info: UserInfo,
205+
) -> RunnerScaler:
206+
runner_scaler = RunnerScaler.build(application_configuration, user_info)
207+
runner_scaler._manager = runner_manager
208+
return runner_scaler
209+
210+
200211
@pytest.fixture(scope="function", name="runner_scaler_one_runner")
201212
def runner_scaler_one_runner_fixture(
202213
runner_manager: RunnerManager, user_info: UserInfo

src/charm.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,9 @@ def _on_upgrade_charm(self, _: UpgradeCharmEvent) -> None:
319319
logger.info(UPGRADE_MSG)
320320
self._common_install_code()
321321
_disable_legacy_service()
322+
state = self._setup_state()
323+
self._setup_service(state)
324+
self._manager_client.flush_runner()
322325

323326
@catch_charm_errors
324327
def _on_config_changed(self, _: ConfigChangedEvent) -> None:

tests/integration/jobmanager/test_jobmanager_reactive.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@
4040
logger = logging.getLogger(__name__)
4141
pytestmark = pytest.mark.openstack
4242

43+
# This is tied to the wait time in reactive processes.
44+
REACTIVE_WAIT_TIME = 60
45+
# Set to a higher reconcile time, due to flush on empty queue in reconcile can affect the tests.
46+
RECONCILE_INTERVAL = 15
47+
4348

4449
@pytest_asyncio.fixture(name="app_for_reactive")
4550
async def app_for_reactive_fixture(
@@ -70,9 +75,7 @@ async def app_for_reactive_fixture(
7075
{
7176
BASE_VIRTUAL_MACHINES_CONFIG_NAME: "0",
7277
MAX_TOTAL_VIRTUAL_MACHINES_CONFIG_NAME: "1",
73-
# set larger recon interval as the default due to race condition
74-
# killing reactive process
75-
RECONCILE_INTERVAL_CONFIG_NAME: "5",
78+
RECONCILE_INTERVAL_CONFIG_NAME: str(RECONCILE_INTERVAL),
7679
}
7780
)
7881
await wait_for_reconcile(app_for_reactive)
@@ -199,7 +202,9 @@ async def _prepare_runner() -> bool:
199202
)
200203
job_get_handler.respond_with_json(returned_job.to_dict())
201204

202-
with httpserver.wait(raise_assertions=True, stop_on_nohandler=False, timeout=30) as waiting:
205+
with httpserver.wait(
206+
raise_assertions=True, stop_on_nohandler=False, timeout=REACTIVE_WAIT_TIME + 30
207+
) as waiting:
203208
logger.info("Waiting for job status to be queried.")
204209
logger.info("server log: %s ", (httpserver.log))
205210
assert waiting.result, "Failed waiting for job status to be queried."
@@ -211,7 +216,7 @@ async def _prepare_runner() -> bool:
211216
# TMP: hack to trigger reconcile by changing the configuration, which cause config_changed hook
212217
# to restart the reconcile service.
213218
await app_for_reactive.set_config(
214-
{RECONCILE_INTERVAL_CONFIG_NAME: str(DEFAULT_RECONCILE_INTERVAL + 1)}
219+
{RECONCILE_INTERVAL_CONFIG_NAME: str(RECONCILE_INTERVAL + 1)}
215220
)
216221
await wait_for_reconcile(app_for_reactive)
217222

tests/unit/test_charm.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,10 @@ def test_common_install_code(
222222
"""
223223
state_mock = MagicMock()
224224
harness.charm._setup_state = MagicMock(return_value=state_mock)
225-
225+
manager_client_mock = MagicMock(spec=GitHubRunnerManagerClient)
226+
harness.charm._manager_client = manager_client_mock
227+
mock_manager_service = MagicMock()
228+
monkeypatch.setattr("charm.manager_service", mock_manager_service)
226229
monkeypatch.setattr("charm.logrotate.setup", setup_logrotate := MagicMock())
227230
monkeypatch.setattr("charm.systemd", MagicMock())
228231

@@ -630,6 +633,10 @@ def test_metric_log_ownership_for_upgrade(
630633

631634
mock_metric_log_path = tmp_path
632635
mock_metric_log_path.touch(exist_ok=True)
636+
manager_client_mock = MagicMock(spec=GitHubRunnerManagerClient)
637+
harness.charm._manager_client = manager_client_mock
638+
mock_manager_service = MagicMock()
639+
monkeypatch.setattr("charm.manager_service", mock_manager_service)
633640
monkeypatch.setattr("charm.METRICS_LOG_PATH", mock_metric_log_path)
634641
monkeypatch.setattr("charm.shutil", shutil_mock := MagicMock())
635642
monkeypatch.setattr("charm.execute_command", MagicMock(return_value=(0, "Mock_stdout")))
@@ -652,6 +659,10 @@ def test_attempting_disable_legacy_service_for_upgrade(
652659
assert: Calls to stop the legacy service is performed.
653660
"""
654661
harness.charm._setup_state = MagicMock()
662+
manager_client_mock = MagicMock(spec=GitHubRunnerManagerClient)
663+
harness.charm._manager_client = manager_client_mock
664+
mock_manager_service = MagicMock()
665+
monkeypatch.setattr("charm.manager_service", mock_manager_service)
655666
monkeypatch.setattr("charm.systemd", mock_systemd := MagicMock())
656667
monkeypatch.setattr("charm.execute_command", MagicMock(return_value=(0, "Mock_stdout")))
657668
monkeypatch.setattr("charm.pathlib", MagicMock())

0 commit comments

Comments
 (0)