@@ -386,8 +386,12 @@ def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[A
386386 data = pickle .load (f )
387387 # Copy and hash only the input attributes
388388 memo_fu : Future = Future ()
389- assert data ['exception' ] is None
390- memo_fu .set_result (data ['result' ])
389+
390+ if data ['exception' ] is None :
391+ memo_fu .set_result (data ['result' ])
392+ else :
393+ assert data ['result' ] is None
394+ memo_fu .set_exception (data ['exception' ])
391395 memo_lookup_table [data ['hash' ]] = memo_fu
392396
393397 except EOFError :
@@ -470,20 +474,22 @@ def checkpoint(self, *, task: Optional[TaskRecord] = None, exception: Optional[B
470474 # TODO: refactor with below
471475
472476 task_id = task ['id' ]
473-
474- if exception is None :
475- hashsum = task ['hashsum' ]
476- if not hashsum :
477- pass # TODO: log an error? see below discussion
478- else :
477+ hashsum = task ['hashsum' ]
478+ if not hashsum :
479+ pass # TODO: log an error? see below discussion
480+ else :
481+ if exception is None and self .filter_result_for_checkpoint (result ):
479482 t = {'hash' : hashsum , 'exception' : None , 'result' : result }
480-
481- # We are using pickle here since pickle dumps to a file in 'ab'
482- # mode behave like a incremental log.
483483 pickle .dump (t , f )
484484 count += 1
485-
486- logger .debug ("Task {} checkpointed" .format (task_id ))
485+ logger .debug ("Task {} checkpointed result" .format (task_id ))
486+ elif exception is not None and self .filter_exception_for_checkpoint (exception ):
487+ t = {'hash' : hashsum , 'exception' : exception , 'result' : None }
488+ pickle .dump (t , f )
489+ count += 1
490+ logger .debug ("Task {} checkpointed exception" .format (task_id ))
491+ else :
492+ pass # no checkpoint - maybe debug log? TODO
487493 else :
488494 checkpoint_queue = self .checkpointable_tasks
489495
@@ -494,18 +500,22 @@ def checkpoint(self, *, task: Optional[TaskRecord] = None, exception: Optional[B
494500
495501 assert app_fu .done (), "trying to checkpoint a task that is not done"
496502
497- if app_fu .done () and app_fu .exception () is None :
498- hashsum = task_record ['hashsum' ]
499- if not hashsum :
500- continue # TODO: log an error? maybe some tasks don't have hashsums legitimately?
501- t = {'hash' : hashsum , 'exception' : None , 'result' : app_fu .result ()}
503+ hashsum = task_record ['hashsum' ]
504+ if not hashsum :
505+ continue # TODO: log an error? maybe some tasks don't have hashsums legitimately?
502506
503- # We are using pickle here since pickle dumps to a file in 'ab'
504- # mode behave like a incremental log.
507+ if app_fu . exception () is None and self . filter_result_for_checkpoint ( app_fu . result ()):
508+ t = { 'hash' : hashsum , 'exception' : None , 'result' : app_fu . result ()}
505509 pickle .dump (t , f )
506510 count += 1
507-
508- logger .debug ("Task {} checkpointed" .format (task_id ))
511+ logger .debug ("Task {} checkpointed result" .format (task_id ))
512+ elif (e := app_fu .exception ()) is not None and self .filter_exception_for_checkpoint (e ):
513+ t = {'hash' : hashsum , 'exception' : app_fu .exception (), 'result' : None }
514+ pickle .dump (t , f )
515+ count += 1
516+ logger .debug ("Task {} checkpointed exception" .format (task_id ))
517+ else :
518+ pass # TODO: maybe log at debug level
509519
510520 self .checkpointed_tasks += count
511521
@@ -523,3 +533,11 @@ def checkpoint(self, *, task: Optional[TaskRecord] = None, exception: Optional[B
523533 # Or maybe a failure of iteration if the list is appended to while checkpointing is happening?
524534 if not task :
525535 self .checkpointable_tasks = []
536+
def filter_result_for_checkpoint(self, result: Any) -> bool:
    """Decide whether a task that ended with a successful result is checkpointed.

    This is an overridable hook: ``checkpoint()`` calls it with the task's
    result when the task finished without an exception, and only writes a
    checkpoint entry if it returns ``True``.

    Args:
        result: The value the task completed with.

    Returns:
        ``True`` to checkpoint the result, ``False`` to skip it. The default
        implementation accepts every result.
    """
    return True
540+
def filter_exception_for_checkpoint(self, exception: BaseException) -> bool:
    """Decide whether a task that ended with an exception is checkpointed.

    This is an overridable hook: ``checkpoint()`` calls it with the task's
    exception when the task failed, and only writes a checkpoint entry
    (with ``'result'`` set to ``None``) if it returns ``True``.

    Args:
        exception: The exception the task ended with.

    Returns:
        ``True`` to checkpoint the exception, ``False`` to skip it. The
        default implementation rejects every exception, so failures are not
        checkpointed unless this method is overridden.
    """
    return False
0 commit comments