1111import typeguard
1212
1313from parsl .dataflow .errors import BadCheckpoint
14+ from parsl .dataflow .futures import AppFuture
1415from parsl .dataflow .taskrecord import TaskRecord
1516
1617if TYPE_CHECKING :
@@ -336,8 +337,12 @@ def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[A
336337 data = pickle .load (f )
337338 # Copy and hash only the input attributes
338339 memo_fu : Future = Future ()
339- assert data ['exception' ] is None
340- memo_fu .set_result (data ['result' ])
340+
341+ if data ['exception' ] is None :
342+ memo_fu .set_result (data ['result' ])
343+ else :
344+ assert data ['result' ] is None
345+ memo_fu .set_exception (data ['exception' ])
341346 memo_lookup_table [data ['hash' ]] = memo_fu
342347
343348 except EOFError :
@@ -411,17 +416,22 @@ def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
411416
412417 app_fu = task_record ['app_fu' ]
413418
414- if app_fu .done () and app_fu .exception () is None :
419+ if app_fu .done () and self .filter_for_checkpoint (app_fu ):
420+
415421 hashsum = task_record ['hashsum' ]
416422 if not hashsum :
417423 continue
418- t = {'hash' : hashsum , 'exception' : None , 'result' : app_fu .result ()}
424+
425+ if app_fu .exception () is None :
426+ t = {'hash' : hashsum , 'exception' : None , 'result' : app_fu .result ()}
427+ else :
428+ t = {'hash' : hashsum , 'exception' : app_fu .exception (), 'result' : None }
419429
420430 # We are using pickle here since pickle dumps to a file in 'ab'
421431 # mode behave like a incremental log.
422432 pickle .dump (t , f )
423433 count += 1
424- logger .debug ("Task {} checkpointed" .format (task_id ))
434+ logger .debug ("Task {} checkpointed as result " .format (task_id ))
425435
426436 self .checkpointed_tasks += count
427437
@@ -434,3 +444,7 @@ def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
434444 logger .info ("Done checkpointing {} tasks" .format (count ))
435445
436446 return checkpoint_dir
447+
448+ def filter_for_checkpoint (self , app_fu : AppFuture ) -> bool :
449+ """Overridable hook: return True if the outcome of ``app_fu`` should be written to the checkpoint; this default accepts only futures whose ``exception()`` is None."""
450+ return app_fu .exception () is None
0 commit comments