2525from  aiida .common .log  import  LOG_LEVEL_REPORT 
2626from  aiida .engine  import  Process , ProcessState 
2727from  aiida .engine .processes  import  control  as  process_control 
28+ from  aiida .engine .utils  import  exponential_backoff_retry 
2829from  aiida .orm  import  CalcJobNode , Group , WorkChainNode , WorkflowNode , WorkFunctionNode 
2930from  tests .utils .processes  import  WaitProcess 
3031
@@ -53,6 +54,7 @@ def start_daemon_worker_in_foreground_and_redirect_streams(
5354
5455    try :
5556        pid  =  os .getpid ()
57+         # For easier debugging you can change these to stdout 
5658        sys .stdout  =  open (log_dir  /  f'worker-{ pid }  , 'w' )
5759        sys .stderr  =  open (log_dir  /  f'worker-{ pid }  , 'w' )
5860        start_daemon_worker (False , aiida_profile_name )
@@ -72,10 +74,22 @@ def mock_open(_):
7274        raise  Exception ('Mock open exception' )
7375
7476    @staticmethod  
75-     async  def  mock_exponential_backoff_retry ( * _ , ** __ ):
77+     async  def  exponential_backoff_retry_fail_upload ( fct :  t . Callable [...,  t . Any ],  * args , ** kwargs ):
7678        from  aiida .common .exceptions  import  TransportTaskException 
7779
78-         raise  TransportTaskException 
80+         if  'do_upload'  in  fct .__name__ :
81+             raise  TransportTaskException 
82+         else :
83+             return  await  exponential_backoff_retry (fct , * args , ** kwargs )
84+ 
85+     @staticmethod  
86+     async  def  exponential_backoff_retry_fail_kill (fct : t .Callable [..., t .Any ], * args , ** kwargs ):
87+         from  aiida .common .exceptions  import  TransportTaskException 
88+ 
89+         if  'do_kill'  in  fct .__name__ :
90+             raise  TransportTaskException 
91+         else :
92+             return  await  exponential_backoff_retry (fct , * args , ** kwargs )
7993
8094
8195@pytest .fixture (scope = 'function' ) 
@@ -213,11 +227,12 @@ def make_a_builder(sleep_seconds=0):
213227
214228@pytest .mark .requires_rmq  
215229@pytest .mark .usefixtures ('started_daemon_client' ) 
216- def  test_process_kill_failng_ebm (
230+ def  test_process_kill_failing_ebm_upload (
217231    fork_worker_context , submit_and_await , aiida_code_installed , run_cli_command , monkeypatch 
218232):
219-     """9) Kill a process that is paused after EBM (5 times failed). It should be possible to kill it normally. 
220-     # (e.g. in scenarios that transport is working again) 
233+     """Kill a process that is waiting after failed EBM during upload. It should be possible to kill it normally. 
234+ 
235+     This covers a process whose upload failed and that is then killed without the force option (e.g. in scenarios where the transport is working again).
221236    """ 
222237    from  aiida .orm  import  Int 
223238
@@ -232,7 +247,10 @@ def make_a_builder(sleep_seconds=0):
232247
233248    kill_timeout  =  10 
234249
235-     monkeypatch_args  =  ('aiida.engine.utils.exponential_backoff_retry' , MockFunctions .mock_exponential_backoff_retry )
250+     monkeypatch_args  =  (
251+         'aiida.engine.utils.exponential_backoff_retry' ,
252+         MockFunctions .exponential_backoff_retry_fail_upload ,
253+     )
236254    with  fork_worker_context (monkeypatch .setattr , monkeypatch_args ):
237255        node  =  submit_and_await (make_a_builder (), ProcessState .WAITING )
238256        await_condition (
@@ -241,11 +259,56 @@ def make_a_builder(sleep_seconds=0):
241259            timeout = kill_timeout ,
242260        )
243261
244-         # should restart EBM and be again not successful 
262+         # kill should start EBM and should successfully kill 
263+         run_cli_command (cmd_process .process_kill , [str (node .pk ), '--wait' ])
264+         await_condition (lambda : node .is_killed , timeout = kill_timeout )
265+ 
266+ 
267+ @pytest .mark .requires_rmq  
268+ @pytest .mark .usefixtures ('started_daemon_client' ) 
269+ def  test_process_kill_failing_ebm_kill (
270+     fork_worker_context , submit_and_await , aiida_code_installed , run_cli_command , monkeypatch 
271+ ):
272+     """Kill a process with a failing EBM during the kill.
273+ 
274+     Killing a process tries to gracefully cancel the job on the remote node. If there are connection problems it retries 
275+     the cancellation using the EBM. If this fails, another kill command can be sent to restart the cancellation of the scheduler job.
276+     """ 
277+     from  aiida .orm  import  Int 
278+ 
279+     code  =  aiida_code_installed (default_calc_job_plugin = 'core.arithmetic.add' , filepath_executable = '/bin/bash' )
280+ 
281+     def  make_a_builder (sleep_seconds = 0 ):
282+         builder  =  code .get_builder ()
283+         builder .x  =  Int (1 )
284+         builder .y  =  Int (1 )
285+         builder .metadata .options .sleep  =  sleep_seconds 
286+         return  builder 
287+ 
288+     kill_timeout  =  10 
289+ 
290+     monkeypatch_args  =  (
291+         'aiida.engine.utils.exponential_backoff_retry' ,
292+         MockFunctions .exponential_backoff_retry_fail_kill ,
293+     )
294+     # from aiida.engine.utils import exponential_backoff_retry 
295+     # monkeypatch_args = ('aiida.engine.utils.exponential_backoff_retry', exponential_backoff_retry) 
296+     with  fork_worker_context (monkeypatch .setattr , monkeypatch_args ):
297+         node  =  submit_and_await (make_a_builder (kill_timeout  +  10 ), ProcessState .WAITING , timeout = kill_timeout )
298+         await_condition (
299+             lambda : node .process_status  ==  'Monitoring scheduler: job state RUNNING' ,
300+             timeout = kill_timeout ,
301+         )
302+ 
303+         # kill should start the EBM, which is expected to fail, so the process is not killed
304+         run_cli_command (cmd_process .process_kill , [str (node .pk ), '--wait' ])
305+         await_condition (lambda : not  node .is_killed , timeout = kill_timeout )
306+ 
307+         # a second kill should restart the EBM, which is expected to fail again
245308        run_cli_command (cmd_process .process_kill , [str (node .pk ), '--wait' ])
246309        await_condition (lambda : not  node .is_killed , timeout = kill_timeout )
247310
248-         # should skip EBM and successfully kill the process 
311+         # force kill should skip the EBM and successfully kill the process
249312        run_cli_command (cmd_process .process_kill , [str (node .pk ), '-F' , '--wait' ])
250313        await_condition (lambda : node .is_killed , timeout = kill_timeout )
251314
0 commit comments