 from aiida.common.log import LOG_LEVEL_REPORT
 from aiida.engine import Process, ProcessState
 from aiida.engine.processes import control as process_control
+from aiida.engine.utils import exponential_backoff_retry
 from aiida.orm import CalcJobNode, Group, WorkChainNode, WorkflowNode, WorkFunctionNode
 from tests.utils.processes import WaitProcess

@@ -53,6 +54,7 @@ def start_daemon_worker_in_foreground_and_redirect_streams(

     try:
         pid = os.getpid()
+        # For easier debugging you can change these to stdout
         sys.stdout = open(log_dir / f'worker-{pid}.out', 'w')
         sys.stderr = open(log_dir / f'worker-{pid}.err', 'w')
         start_daemon_worker(False, aiida_profile_name)
@@ -72,10 +74,22 @@ def mock_open(_):
         raise Exception('Mock open exception')

     @staticmethod
-    async def mock_exponential_backoff_retry(*_, **__):
+    async def exponential_backoff_retry_fail_upload(fct: t.Callable[..., t.Any], *args, **kwargs):
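+        # fail only the upload task; all other transport tasks are delegated to the real EBM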
         from aiida.common.exceptions import TransportTaskException

-        raise TransportTaskException
+        if 'do_upload' in fct.__name__:
+            raise TransportTaskException
+        else:
+            return await exponential_backoff_retry(fct, *args, **kwargs)
+
+    @staticmethod
+    async def exponential_backoff_retry_fail_kill(fct: t.Callable[..., t.Any], *args, **kwargs):
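+        # fail only the kill task; all other transport tasks are delegated to the real EBM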
+        from aiida.common.exceptions import TransportTaskException
+
+        if 'do_kill' in fct.__name__:
+            raise TransportTaskException
+        else:
+            return await exponential_backoff_retry(fct, *args, **kwargs)


 @pytest.fixture(scope='function')
@@ -213,11 +227,12 @@ def make_a_builder(sleep_seconds=0):

 @pytest.mark.requires_rmq
 @pytest.mark.usefixtures('started_daemon_client')
-def test_process_kill_failng_ebm(
+def test_process_kill_failing_ebm_upload(
     fork_worker_context, submit_and_await, aiida_code_installed, run_cli_command, monkeypatch
 ):
-    """9) Kill a process that is paused after EBM (5 times failed). It should be possible to kill it normally.
-    # (e.g. in scenarios that transport is working again)
+    """Kill a process that is waiting after the EBM failed during upload. It should be possible to kill it normally.
+
+    The upload fails in the EBM, leaving the process waiting; a subsequent kill (e.g. in scenarios where the
+    transport is working again) should go through normally.
     """
     from aiida.orm import Int

@@ -232,7 +247,10 @@ def make_a_builder(sleep_seconds=0):

     kill_timeout = 10

-    monkeypatch_args = ('aiida.engine.utils.exponential_backoff_retry', MockFunctions.mock_exponential_backoff_retry)
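+    # patch the EBM so that only the upload task fails; all other transport tasks use the real EBM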
+    monkeypatch_args = (
+        'aiida.engine.utils.exponential_backoff_retry',
+        MockFunctions.exponential_backoff_retry_fail_upload,
+    )
     with fork_worker_context(monkeypatch.setattr, monkeypatch_args):
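+        # the upload task fails through the mocked EBM and the process ends up waiting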
         node = submit_and_await(make_a_builder(), ProcessState.WAITING)
         await_condition(
@@ -241,10 +259,60 @@ def make_a_builder(sleep_seconds=0):
             timeout=kill_timeout,
         )

+        # the kill should start the EBM and successfully kill the process
         run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
         await_condition(lambda: node.is_killed, timeout=kill_timeout)


+@pytest.mark.requires_rmq
+@pytest.mark.usefixtures('started_daemon_client')
+def test_process_kill_failing_ebm_kill(
+    fork_worker_context, submit_and_await, aiida_code_installed, run_cli_command, monkeypatch
+):
+    """Kill a process with a failing EBM during the kill.
+
+    Killing a process tries to gracefully cancel the job on the remote node. If there are connection problems, the
+    cancelation is retried using the EBM. If this fails, another kill command can be sent to restart the cancelation
+    of the job on the scheduler.
+    """
+    from aiida.orm import Int
+
+    code = aiida_code_installed(default_calc_job_plugin='core.arithmetic.add', filepath_executable='/bin/bash')
+
+    def make_a_builder(sleep_seconds=0):
+        builder = code.get_builder()
+        builder.x = Int(1)
+        builder.y = Int(1)
+        builder.metadata.options.sleep = sleep_seconds
+        return builder
+
+    kill_timeout = 10
+
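+    # patch the EBM so that only the kill task fails; upload and submission go through the real EBM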
+    monkeypatch_args = (
+        'aiida.engine.utils.exponential_backoff_retry',
+        MockFunctions.exponential_backoff_retry_fail_kill,
+    )
+    # to run the test without the failure injection, monkeypatch with the unpatched EBM instead:
+    # from aiida.engine.utils import exponential_backoff_retry
+    # monkeypatch_args = ('aiida.engine.utils.exponential_backoff_retry', exponential_backoff_retry)
+    with fork_worker_context(monkeypatch.setattr, monkeypatch_args):
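+        # submit a job that sleeps longer than the kill timeout so it is still running when the kill is issued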
+        node = submit_and_await(make_a_builder(kill_timeout + 10), ProcessState.WAITING, timeout=kill_timeout)
+        await_condition(
+            lambda: node.process_status == 'Monitoring scheduler: job state RUNNING',
+            timeout=kill_timeout,
+        )
+
+        # the kill starts the EBM, which fails, so the process is not killed
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
+        await_condition(lambda: not node.is_killed, timeout=kill_timeout)
+
+        # a second kill restarts the EBM, which fails again, so the process is still not killed
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
+        await_condition(lambda: not node.is_killed, timeout=kill_timeout)
+
+        # force kill should skip EBM and successfully kill the process
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '-F', '--wait'])
+        await_condition(lambda: node.is_killed, timeout=kill_timeout)
+
+
 class TestVerdiProcess:
     """Tests for `verdi process`."""
