88
99from streamflow .core .exception import FailureHandlingException
1010from streamflow .core .recovery import RecoveryPolicy
11- from streamflow .core .utils import get_tag
12- from streamflow .core .workflow import Job , Step , Token , Workflow
11+ from streamflow .core .utils import get_job_tag , get_tag
12+ from streamflow .core .workflow import Job , Status , Step , Token , Workflow
1313from streamflow .log_handler import logger
1414from streamflow .persistence .loading_context import WorkflowBuilder
1515from streamflow .recovery .utils import (
@@ -93,7 +93,7 @@ async def _inject_tokens(mapper: GraphMapper, new_workflow: Workflow) -> None:
9393 ):
9494 if logger .isEnabledFor (logging .DEBUG ):
9595 logger .debug (f"Injecting termination token on port { port .name } " )
96- port .put (TerminationToken ())
96+ port .put (TerminationToken (Status . SKIPPED ))
9797
9898
9999async def _populate_workflow (
@@ -111,14 +111,16 @@ async def _populate_workflow(
111111 for step_id in step_ids
112112 )
113113 )
114- # Add failed step into new_workflow
114+ # Add the failed step to the new workflow
115115 await workflow_builder .load_step (
116116 new_workflow .context ,
117117 failed_step .persistent_id ,
118118 )
119- # Instantiate ports capable of moving tokens across workflows
119+ # Instantiate ports that can transfer tokens between workflows
120120 for port in new_workflow .ports .values ():
121- if not isinstance (port , ConnectorPort ):
121+ if not isinstance (
122+ port , (ConnectorPort , InterWorkflowJobPort , InterWorkflowPort )
123+ ):
122124 new_workflow .create_port (
123125 (
124126 InterWorkflowJobPort
@@ -129,7 +131,7 @@ async def _populate_workflow(
129131 )
130132 for port in failed_step .get_output_ports ().values ():
131133 cast (InterWorkflowPort , new_workflow .ports [port .name ]).add_inter_port (
132- port , border_tag = get_tag (failed_job .inputs .values ())
134+ port , boundary_tag = get_tag (failed_job .inputs .values ()), terminate = False
133135 )
134136
135137
@@ -175,21 +177,23 @@ async def _recover_workflow(self, failed_job: Job, failed_step: Step) -> Workflo
175177 ]
176178 )
177179 mapper = await create_graph_mapper (self .context , provenance )
178- # Synchronize across multiple recovery workflows
180+ # Synchronize between multiple recovery workflows
179181 job_tokens = list (
180182 filter (lambda t : isinstance (t , JobToken ), mapper .token_instances .values ())
181183 )
182- await self ._sync_workflows (
183- {* (t .value .name for t in job_tokens ), failed_job .name },
184- job_tokens ,
185- mapper ,
186- new_workflow ,
184+ job_names = await self ._sync_workflows (
185+ job_names = {* (t .value .name for t in job_tokens ), failed_job .name },
186+ job_tokens = job_tokens ,
187+ mapper = mapper ,
188+ workflow = new_workflow ,
187189 )
188190 # Populate new workflow
189191 steps = await mapper .get_port_and_step_ids (failed_step .output_ports .values ())
190192 await _populate_workflow (
191193 steps , failed_step , new_workflow , workflow_builder , failed_job
192194 )
195+ for job_name in job_names :
196+ self .context .failure_manager .get_request (job_name ).workflow_ready .set ()
193197 await _inject_tokens (mapper , new_workflow )
194198 await _set_step_states (mapper , new_workflow )
195199 return new_workflow
@@ -200,7 +204,8 @@ async def _sync_workflows(
200204 job_tokens : MutableSequence [Token ],
201205 mapper : GraphMapper ,
202206 workflow : Workflow ,
203- ) -> None :
207+ ) -> MutableSequence [str ]:
208+ new_job_names = []
204209 for job_name in job_names :
205210 retry_request = self .context .failure_manager .get_request (job_name )
206211 if (
@@ -209,20 +214,35 @@ async def _sync_workflows(
209214 )
210215 ) == TokenAvailability .FutureAvailable :
211216 job_token = get_job_token (job_name , job_tokens )
212- # The `retry_request` is the current job running, instead
213- # the `job_token` is the token to remove in the graph because
214- # the workflow will depend on the already running job
217+ # `retry_request` represents the currently running job.
218+ # `job_token` refers to the token that needs to be removed from the graph,
219+ # as the workflow depends on the already running job.
215220 if logger .isEnabledFor (logging .DEBUG ):
216- logger .debug (f"Synchronize rollbacks: job { job_name } is running" )
217- # todo: create a unit test for this case
221+ if not (is_wf_ready := retry_request .workflow_ready .is_set ()):
222+ logger .debug (
223+ f"Synchronizing rollbacks: Job { job_name } is waiting for the rollback workflow to be ready."
224+ )
225+ else :
226+ logger .debug (
227+ f"Synchronizing rollbacks: Job { job_name } is currently executing."
228+ )
229+ else :
230+ is_wf_ready = True
231+ await retry_request .workflow_ready .wait ()
232+ if logger .isEnabledFor (logging .DEBUG ) and not is_wf_ready :
233+ logger .debug (
234+ f"Synchronizing rollbacks: Job { job_name } has resumed after the rollback workflow is ready."
235+ )
218236 for port_name in await mapper .get_output_ports (job_token ):
219237 if port_name in retry_request .workflow .ports .keys ():
220238 cast (
221239 InterWorkflowPort , retry_request .workflow .ports [port_name ]
222240 ).add_inter_port (
223- workflow .create_port (cls = InterWorkflowPort , name = port_name )
241+ workflow .create_port (cls = InterWorkflowPort , name = port_name ),
242+ boundary_tag = get_job_tag (job_token .value .name ),
243+ terminate = True ,
224244 )
225- # Remove tokens recovered in other workflows
245+ # Remove tokens that will be recovered in other workflows
226246 for token_id in await mapper .get_output_tokens (job_token .persistent_id ):
227247 mapper .remove_token (token_id , preserve_token = True )
228248 elif is_available == TokenAvailability .Available :
@@ -247,6 +267,9 @@ async def _sync_workflows(
247267 else :
248268 await self .context .failure_manager .update_request (job_name )
249269 retry_request .workflow = workflow
270+ retry_request .workflow_ready .clear ()
271+ new_job_names .append (job_name )
272+ return new_job_names
250273
251274 async def recover (self , failed_job : Job , failed_step : Step ) -> None :
252275 # Create recover workflow
0 commit comments