[ISD-3570] Flush runner will kill reactive processes as well (#595)

yhaliaw · web-flow · commit d0947c688de8 · 2025-07-21T15:33:15.000+08:00
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -2,7 +2,11 @@
 
 This changelog documents user-relevant changes to the GitHub runner charm.
 
-## 2025-07-15
+## 2025-07-18
+
+- Fix an issue where flushing runners did not include reactive processes. This caused some reactive runners to spawn with old code after an upgrade.
+
+## 2025-07-16
 
 - Fix the incorrect default value of the aproxy-exclude-addresses configuration.
 
@@ -12,6 +16,7 @@ This changelog documents user-relevant changes to the GitHub runner charm.
  due to a bug on the OpenStack side: https://bugs.launchpad.net/nova/+bug/2095364
 
 ### 2025-06-30
+
 - New configuration options aproxy-exclude-addresses and aproxy-redirect-ports for allowing aproxy to redirect arbitrary TCP traffic
 - Added prometheus metrics to the GitHub runner manager application.
 
diff --git a/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py b/github-runner-manager/src/github_runner_manager/manager/runner_scaler.py
@@ -267,6 +267,8 @@ def flush(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> int:
             Number of runners flushed.
         """
         metric_stats = self._manager.cleanup()
+        if self._reactive_config is not None:
+            reactive_runner_manager.flush_reactive_processes()
         delete_metric_stats = self._manager.flush_runners(flush_mode=flush_mode)
         events = set(delete_metric_stats.keys()) | set(metric_stats.keys())
         metric_stats = {
diff --git a/github-runner-manager/src/github_runner_manager/reactive/consumer.py b/github-runner-manager/src/github_runner_manager/reactive/consumer.py
@@ -27,6 +27,8 @@
 
 Labels = set[str]
 
+PROCESS_COUNT_HEADER_NAME = "X-Process-Count"
+WAIT_TIME_IN_SEC = 60
 # This control message is for testing. The reactive process will stop consuming messages
 # when the message is sent. This message does not come from the router.
 END_PROCESSING_PAYLOAD = "__END__"
@@ -125,6 +127,14 @@ def consume(  # noqa: C901
                 if msg.payload == END_PROCESSING_PAYLOAD:
                     msg.ack()
                     break
+
+                msg.headers[PROCESS_COUNT_HEADER_NAME] = (
+                    msg.headers.get(PROCESS_COUNT_HEADER_NAME, 0) + 1
+                )
+                # Avoid rapid retrying to prevent overloading services, e.g., OpenStack API.
+                if msg.headers[PROCESS_COUNT_HEADER_NAME] > 1:
+                    sleep(WAIT_TIME_IN_SEC)
+
                 job_details = _parse_job_details(msg)
                 logger.info("Received reactive job: %s", job_details)
                 if not _validate_labels(
@@ -248,7 +258,7 @@ def _spawn_runner(
     logger.info("Reactive runner spawned %s", instance_ids)
 
     for _ in range(5):
-        sleep(60)
+        sleep(WAIT_TIME_IN_SEC)
         logger.info("Checking if job picked up for reactive runner %s", instance_ids)
         if platform_provider.check_job_been_picked_up(metadata=metadata, job_url=job_url):
             logger.info("Job picked %s. reactive runner ok %s", job_url, instance_ids)
diff --git a/github-runner-manager/src/github_runner_manager/reactive/process_manager.py b/github-runner-manager/src/github_runner_manager/reactive/process_manager.py
@@ -90,6 +90,23 @@ def reconcile(
     return delta
 
 
+def kill_reactive_processes() -> None:
+    """Kill all reactive processes."""
+    pids = _get_pids()
+    if pids:
+        for pid in pids:
+            try:
+                logger.info("Killing reactive runner process with pid %s", pid)
+                os.kill(pid, signal.SIGTERM)
+            except ProcessLookupError:
+                logger.info(
+                    "Failed to kill process with pid %s. Process might have terminated it self.",
+                    pid,
+                )
+    else:
+        logger.info("No reactive processes to flush")
+
+
 def _get_pids() -> list[int]:
     """Get the PIDs of the reactive runners processes.
 
diff --git a/github-runner-manager/src/github_runner_manager/reactive/runner_manager.py b/github-runner-manager/src/github_runner_manager/reactive/runner_manager.py
@@ -128,3 +128,8 @@ def reconcile(
     )
 
     return ReconcileResult(processes_diff=processes_created, metric_stats=metric_stats)
+
+
+def flush_reactive_processes() -> None:
+    """Flush all the reactive processes."""
+    process_manager.kill_reactive_processes()
diff --git a/github-runner-manager/tests/unit/reactive/test_process_manager.py b/github-runner-manager/tests/unit/reactive/test_process_manager.py
@@ -15,6 +15,7 @@
     PYTHON_BIN,
     REACTIVE_RUNNER_SCRIPT_MODULE,
     ReactiveRunnerError,
+    kill_reactive_processes,
     reconcile,
 )
 from github_runner_manager.reactive.types_ import QueueConfig, ReactiveProcessConfig
@@ -196,3 +197,17 @@ def _arrange_reactive_processes(secure_run_subprocess_mock: MagicMock, count: in
         stdout=f"CMD\n{process_cmds_before}".encode("utf-8"),
         stderr=b"",
     )
+
+
+def test_reactive_flush(
+    os_kill_mock: MagicMock,
+    secure_run_subprocess_mock: MagicMock,
+):
+    """
+    arrange: Mock 3 reactive processes.
+    act: Run flush for reactive.
+    assert: Find 3 os.kill calls.
+    """
+    _arrange_reactive_processes(secure_run_subprocess_mock, count=3)
+    kill_reactive_processes()
+    assert os_kill_mock.call_count == 3
diff --git a/github-runner-manager/tests/unit/test_runner_scaler.py b/github-runner-manager/tests/unit/test_runner_scaler.py
@@ -197,6 +197,17 @@ def application_configuration_fixture() -> ApplicationConfiguration:
     )
 
 
+@pytest.fixture(scope="function", name="runner_scaler_reactive")
+def runner_scaler_reactive_fixture(
+    application_configuration: ApplicationConfiguration,
+    runner_manager: RunnerManager,
+    user_info: UserInfo,
+) -> RunnerScaler:
+    runner_scaler = RunnerScaler.build(application_configuration, user_info)
+    runner_scaler._manager = runner_manager
+    return runner_scaler
+
+
 @pytest.fixture(scope="function", name="runner_scaler_one_runner")
 def runner_scaler_one_runner_fixture(
     runner_manager: RunnerManager, user_info: UserInfo
diff --git a/src/charm.py b/src/charm.py
@@ -319,6 +319,9 @@ def _on_upgrade_charm(self, _: UpgradeCharmEvent) -> None:
         logger.info(UPGRADE_MSG)
         self._common_install_code()
         _disable_legacy_service()
+        state = self._setup_state()
+        self._setup_service(state)
+        self._manager_client.flush_runner()
 
     @catch_charm_errors
     def _on_config_changed(self, _: ConfigChangedEvent) -> None:
diff --git a/tests/integration/jobmanager/test_jobmanager_reactive.py b/tests/integration/jobmanager/test_jobmanager_reactive.py
@@ -40,6 +40,11 @@
 logger = logging.getLogger(__name__)
 pytestmark = pytest.mark.openstack
 
+# This is tied to the wait time in reactive processes.
+REACTIVE_WAIT_TIME = 60
+# Use a longer reconcile interval: the flush on empty queue during reconcile can affect the tests.
+RECONCILE_INTERVAL = 15
+
 
 @pytest_asyncio.fixture(name="app_for_reactive")
 async def app_for_reactive_fixture(
@@ -70,9 +75,7 @@ async def app_for_reactive_fixture(
         {
             BASE_VIRTUAL_MACHINES_CONFIG_NAME: "0",
             MAX_TOTAL_VIRTUAL_MACHINES_CONFIG_NAME: "1",
-            # set larger recon interval as the default due to race condition
-            # killing reactive process
-            RECONCILE_INTERVAL_CONFIG_NAME: "5",
+            RECONCILE_INTERVAL_CONFIG_NAME: str(RECONCILE_INTERVAL),
         }
     )
     await wait_for_reconcile(app_for_reactive)
@@ -199,7 +202,9 @@ async def _prepare_runner() -> bool:
     )
     job_get_handler.respond_with_json(returned_job.to_dict())
 
-    with httpserver.wait(raise_assertions=True, stop_on_nohandler=False, timeout=30) as waiting:
+    with httpserver.wait(
+        raise_assertions=True, stop_on_nohandler=False, timeout=REACTIVE_WAIT_TIME + 30
+    ) as waiting:
         logger.info("Waiting for job status to be queried.")
     logger.info("server log: %s ", (httpserver.log))
     assert waiting.result, "Failed waiting for job status to be queried."
@@ -211,7 +216,7 @@ async def _prepare_runner() -> bool:
     # TMP: hack to trigger reconcile by changing the configuration, which cause config_changed hook
     # to restart the reconcile service.
     await app_for_reactive.set_config(
-        {RECONCILE_INTERVAL_CONFIG_NAME: str(DEFAULT_RECONCILE_INTERVAL + 1)}
+        {RECONCILE_INTERVAL_CONFIG_NAME: str(RECONCILE_INTERVAL + 1)}
     )
     await wait_for_reconcile(app_for_reactive)
 
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
@@ -222,7 +222,10 @@ def test_common_install_code(
     """
     state_mock = MagicMock()
     harness.charm._setup_state = MagicMock(return_value=state_mock)
-
+    manager_client_mock = MagicMock(spec=GitHubRunnerManagerClient)
+    harness.charm._manager_client = manager_client_mock
+    mock_manager_service = MagicMock()
+    monkeypatch.setattr("charm.manager_service", mock_manager_service)
     monkeypatch.setattr("charm.logrotate.setup", setup_logrotate := MagicMock())
     monkeypatch.setattr("charm.systemd", MagicMock())
 
@@ -630,6 +633,10 @@ def test_metric_log_ownership_for_upgrade(
 
     mock_metric_log_path = tmp_path
     mock_metric_log_path.touch(exist_ok=True)
+    manager_client_mock = MagicMock(spec=GitHubRunnerManagerClient)
+    harness.charm._manager_client = manager_client_mock
+    mock_manager_service = MagicMock()
+    monkeypatch.setattr("charm.manager_service", mock_manager_service)
     monkeypatch.setattr("charm.METRICS_LOG_PATH", mock_metric_log_path)
     monkeypatch.setattr("charm.shutil", shutil_mock := MagicMock())
     monkeypatch.setattr("charm.execute_command", MagicMock(return_value=(0, "Mock_stdout")))
@@ -652,6 +659,10 @@ def test_attempting_disable_legacy_service_for_upgrade(
     assert: Calls to stop the legacy service is performed.
     """
     harness.charm._setup_state = MagicMock()
+    manager_client_mock = MagicMock(spec=GitHubRunnerManagerClient)
+    harness.charm._manager_client = manager_client_mock
+    mock_manager_service = MagicMock()
+    monkeypatch.setattr("charm.manager_service", mock_manager_service)
     monkeypatch.setattr("charm.systemd", mock_systemd := MagicMock())
     monkeypatch.setattr("charm.execute_command", MagicMock(return_value=(0, "Mock_stdout")))
     monkeypatch.setattr("charm.pathlib", MagicMock())

Original file line number	Diff line number	Diff line change
`@@ -128,3 +128,8 @@ def reconcile(`
`128`	`128`	`)`
`129`	`129`
`130`	`130`	`return ReconcileResult(processes_diff=processes_created, metric_stats=metric_stats)`
	`131`	`+`
	`132`	`+`
	`133`	`+def flush_reactive_processes() -> None:`
	`134`	`+ """Flush all the reactive processes."""`
	`135`	`+ process_manager.kill_reactive_processes()`