Move a lot of checkpointing code into Memoizer, from DFK #3986
```diff
@@ -4,15 +4,18 @@
 import logging
 import os
 import pickle
+import threading
 import types
 from concurrent.futures import Future
 from functools import lru_cache, singledispatch
-from typing import Any, Dict, List, Optional, Sequence
+from typing import Any, Dict, List, Literal, Optional, Sequence
 
 import typeguard
 
 from parsl.dataflow.errors import BadCheckpoint
 from parsl.dataflow.taskrecord import TaskRecord
+from parsl.errors import ConfigurationError, InternalConsistencyError
+from parsl.utils import Timer, get_all_checkpoints
 
 logger = logging.getLogger(__name__)
 
```
```diff
@@ -146,7 +149,13 @@ class Memoizer:
     """
 
-    def __init__(self, *, memoize: bool = True, checkpoint_files: Sequence[str]):
+    run_dir: str
+
+    def __init__(self, *,
+                 memoize: bool = True,
+                 checkpoint_files: Sequence[str] | None,
+                 checkpoint_period: Optional[str],
+                 checkpoint_mode: Literal['task_exit', 'periodic', 'dfk_exit', 'manual'] | None):
         """Initialize the memoizer.
 
         KWargs:
```
```diff
@@ -155,6 +164,26 @@ def __init__(self, *, memoize: bool = True, checkpoint_files: Sequence[str]):
         """
         self.memoize = memoize
 
+        self.checkpointed_tasks = 0
+
+        self.checkpoint_lock = threading.Lock()
+
+        self.checkpoint_files = checkpoint_files
+        self.checkpoint_mode = checkpoint_mode
+        self.checkpoint_period = checkpoint_period
+
+        self.checkpointable_tasks: List[TaskRecord] = []
+
+        self._checkpoint_timer: Timer | None = None
+
+    def start(self) -> None:
+        if self.checkpoint_files is not None:
+            checkpoint_files = self.checkpoint_files
+        elif self.checkpoint_files is None and self.checkpoint_mode is not None:
+            checkpoint_files = get_all_checkpoints(self.run_dir)
+        else:
+            checkpoint_files = []
+
         checkpoint = self.load_checkpoints(checkpoint_files)
 
         if self.memoize:
```
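With this change the constructor only records user-supplied settings, while `run_dir` is expected to be assigned by the framework before `start()` resolves and loads checkpoint files. A minimal usage sketch of that lifecycle, assuming this branch's `Memoizer` is importable from `parsl.dataflow.memoization` and using an illustrative run directory path (the caller below stands in for the DFK):

```python
from parsl.dataflow.memoization import Memoizer  # assumes the Memoizer from this PR branch

# Configuration time: user-facing settings only (mirrors the new __init__ signature).
memoizer = Memoizer(
    memoize=True,
    checkpoint_files=None,          # let start() discover checkpoints via get_all_checkpoints()
    checkpoint_period="00:30:00",   # HH:MM:SS; only used with the 'periodic' mode
    checkpoint_mode="periodic",
)

# Runtime: the framework (normally the DFK) supplies the run directory,
# then starts the memoizer, which loads checkpoints and arms the periodic timer.
memoizer.run_dir = "/tmp/parsl_run_000"   # illustrative path
memoizer.start()

# ... tasks run; completed tasks are registered through update_checkpoint() ...

# Shutdown: write a final checkpoint and stop the checkpoint timer.
memoizer.close()
```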
```diff
@@ -164,6 +193,26 @@ def __init__(self, *, memoize: bool = True, checkpoint_files: Sequence[str]):
             logger.info("App caching disabled for all apps")
             self.memo_lookup_table = {}
 
+        if self.checkpoint_mode == "periodic":
+            if self.checkpoint_period is None:
+                raise ConfigurationError("Checkpoint period must be specified with periodic checkpoint mode")
+            else:
```
Comment on lines +197 to +199:

I guess the […]. Meanwhile, stylistically, given the [no-return] semantic of the […].

Reply: It's probably weird to change it at all, and the validation can go into the […].
```diff
+                try:
+                    h, m, s = map(int, self.checkpoint_period.split(':'))
+                except Exception:
+                    raise ConfigurationError("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(self.checkpoint_period))
```
Comment on lines +202 to +203:

Any sense in […]? It would be nice to ensure that this field is valid much earlier. Perhaps in a setter-style setup?
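One way the "setter-style setup" suggested above could look is a property that validates the HH:MM:SS string at assignment time rather than in `start()`. The following is a standalone, illustrative sketch; the class and attribute names are hypothetical and not code from this PR:

```python
from typing import Optional


class CheckpointSettings:
    """Illustrative only: validates checkpoint_period as soon as it is assigned."""

    def __init__(self, checkpoint_period: Optional[str] = None) -> None:
        self.checkpoint_period = checkpoint_period  # routed through the setter below

    @property
    def checkpoint_period(self) -> Optional[str]:
        return self._checkpoint_period

    @checkpoint_period.setter
    def checkpoint_period(self, value: Optional[str]) -> None:
        if value is not None:
            try:
                h, m, s = map(int, value.split(':'))
            except ValueError:
                raise ValueError(f"invalid checkpoint_period provided: {value} expected HH:MM:SS")
            # Cache the parsed interval so later code never re-parses the string.
            self._checkpoint_period_seconds = h * 3600 + m * 60 + s
        self._checkpoint_period = value


settings = CheckpointSettings("00:30:00")   # validated here, long before any start()
```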
```diff
+                checkpoint_period = (h * 3600) + (m * 60) + s
+                self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period, name="Checkpoint")
+
+    def close(self) -> None:
+        if self.checkpoint_mode is not None:
+            logger.info("Making final checkpoint")
+            self.checkpoint()
+
+        if self._checkpoint_timer:
+            logger.info("Stopping checkpoint timer")
+            self._checkpoint_timer.close()
+
     def make_hash(self, task: TaskRecord) -> str:
         """Create a hash of the task inputs.
 
```
```diff
@@ -324,3 +373,78 @@ def load_checkpoints(self, checkpointDirs: Optional[Sequence[str]]) -> Dict[str,
             return self._load_checkpoints(checkpointDirs)
         else:
             return {}
+
+    def update_checkpoint(self, task_record: TaskRecord) -> None:
+        if self.checkpoint_mode == 'task_exit':
+            self.checkpoint(task=task_record)
+        elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
+            with self.checkpoint_lock:
+                self.checkpointable_tasks.append(task_record)
+        elif self.checkpoint_mode is None:
+            pass
+        else:
+            raise InternalConsistencyError(f"Invalid checkpoint mode {self.checkpoint_mode}")
+
+    def checkpoint(self, *, task: Optional[TaskRecord] = None) -> None:
+        """Checkpoint the dfk incrementally to a checkpoint file.
+
+        When called with no argument, all tasks registered in self.checkpointable_tasks
+        will be checkpointed. When called with a single TaskRecord argument, that task will be
+        checkpointed.
+
+        By default the checkpoints are written to the RUNDIR of the current
+        run under RUNDIR/checkpoints/tasks.pkl
+
+        Kwargs:
+            - task (Optional task records) : A task to checkpoint. Default=None, meaning all
+              tasks registered for checkpointing.
+
+        .. note::
+            Checkpointing only works if memoization is enabled
+
+        """
+        with self.checkpoint_lock:
+
+            if task:
+                checkpoint_queue = [task]
+            else:
+                checkpoint_queue = self.checkpointable_tasks
+
+            checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
+            checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
+
+            if not os.path.exists(checkpoint_dir):
+                os.makedirs(checkpoint_dir, exist_ok=True)
+
+            count = 0
+
+            with open(checkpoint_tasks, 'ab') as f:
+                for task_record in checkpoint_queue:
+                    task_id = task_record['id']
+
+                    app_fu = task_record['app_fu']
+
+                    if app_fu.done() and app_fu.exception() is None:
+                        hashsum = task_record['hashsum']
+                        if not hashsum:
+                            continue
+                        t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
+
+                        # We are using pickle here since pickle dumps to a file in 'ab'
+                        # mode behave like a incremental log.
+                        pickle.dump(t, f)
+                        count += 1
+                        logger.debug("Task {} checkpointed".format(task_id))
+
+            self.checkpointed_tasks += count
+
+            if count == 0:
+                if self.checkpointed_tasks == 0:
+                    logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
+                else:
+                    logger.debug("No tasks checkpointed in this pass.")
+            else:
+                logger.info("Done checkpointing {} tasks".format(count))
+
+            if not task:
+                self.checkpointable_tasks = []
```
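Because each checkpoint pass appends one `pickle.dump` per completed task in `'ab'` mode, `tasks.pkl` behaves like an incremental log that can be read back by unpickling records until EOF. A small standalone sketch of writing and reading such a log; the file name and record layout mirror the diff above, but this is illustrative and not Parsl's own loader:

```python
import pickle
from typing import Any, Dict, List

log_path = "tasks.pkl"   # illustrative; Parsl writes RUNDIR/checkpoint/tasks.pkl

# Writing: append one pickled dict per checkpointed task.
with open(log_path, "ab") as f:
    pickle.dump({'hash': 'abc123', 'exception': None, 'result': 42}, f)
    pickle.dump({'hash': 'def456', 'exception': None, 'result': 'done'}, f)

# Reading: unpickle records one by one until the end of the file.
records: List[Dict[str, Any]] = []
with open(log_path, "rb") as f:
    while True:
        try:
            records.append(pickle.load(f))
        except EOFError:
            break

for rec in records:
    print(rec['hash'], rec['result'])
```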
Comment on the new `run_dir` attribute:

Why make `run_dir` an optional instance member? For example, if `checkpoint_files` and `checkpoint_mode` are in the right state and the consumer forgets to externally set it before calling `.start()`... seems like a potential traceback? As opposed to, say, choosing a default value, or ensuring that the mistake is caught at instantiation.

Reply:

There are two initializations in the Parsl style of doing configuration/plugins: the user creates the object, and so user-specified parameters go into `__init__`. Secondly, there's the framework, the DFK, that invents runtime stuff like the rundir. By the time this happens, `__init__` is long in the past - perhaps in a different source file and well before the DFK even begins to exist. So it's probably right to make this be a mandatory parameter of `start()`.

The plugin style in general doesn't do well in differentiating between "uninitialised, configured" (pre-`start`) plugins and "initialised" (post-`start`) objects. That differentiation was something that was forced with monitoring plugin radios, because the "configured, uninitialized" state needs to be moved around over the network. And if I was going to re-implement this object model, I'd probably try to make that separation happen everywhere. But that's not how Parsl is right now.
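The direction the reply points at, passing the run directory into `start()` rather than relying on an externally assigned attribute, could look roughly like the following. This is a hypothetical sketch of that alternative signature, not code from the PR:

```python
import threading
from typing import Optional, Sequence


class MemoizerSketch:
    """Hypothetical shape of the two-phase lifecycle discussed above."""

    def __init__(self, *, memoize: bool = True,
                 checkpoint_files: Optional[Sequence[str]] = None) -> None:
        # Configuration time: only user-specified settings live here.
        self.memoize = memoize
        self.checkpoint_files = checkpoint_files
        self.checkpoint_lock = threading.Lock()

    def start(self, *, run_dir: str) -> None:
        # Runtime: the framework must supply run_dir here, so forgetting it
        # is a TypeError at the call site rather than an AttributeError later.
        self.run_dir = run_dir
        # ... load checkpoints from run_dir, arm timers, etc. ...


memoizer = MemoizerSketch(memoize=True)
memoizer.start(run_dir="/tmp/parsl_run_000")   # illustrative path
```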