Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
fe0004c
Implement exception version of complete_task_result factorisation
benclifford Oct 15, 2025
1ebfdc5
Collapse two nested ifs into a single multiway if
benclifford Oct 15, 2025
141ae2e
Make dependency failure handling more consistent
benclifford Oct 15, 2025
c2818ec
Test correct increase in dep_fail count
benclifford Oct 15, 2025
62693d4
Replace an always-true `if` with an assert
benclifford Oct 15, 2025
ad1416b
Pull more completion code into _complete_task and rename
benclifford Oct 15, 2025
513d2bb
Remove spurious double set of task time_returned
benclifford Oct 15, 2025
b0e8b70
Remove spurious double set of task time_returned
benclifford Oct 15, 2025
91cce04
Move a lot of checkpointing code into Memoizer, from DFK
benclifford Oct 10, 2025
b35a095
Tell memoizer the task result earlier, to fix race condition #3762
benclifford Sep 3, 2025
f4c352d
Inline a single-use test config, from its own file
benclifford Sep 29, 2025
79ee1da
Remove commented dead code
benclifford Sep 29, 2025
757986c
Remove special-case checkpoint case from garbage collector test
benclifford Sep 29, 2025
52f9bbc
inline a test config that is only used in one test and is very specif…
benclifford Sep 29, 2025
eea1819
Remove unused memoizer->dfk reference
benclifford Sep 29, 2025
443500a
Remove a spurious comment on checkpointing shutdown
benclifford Sep 29, 2025
7cc81ac
Remove redundant argument from update_memo
benclifford Sep 3, 2025
2b206b5
TODO: factor a complete_exception method, like complete_task. and mak…
benclifford Sep 29, 2025
ebedfa9
make checkpoint move away from always using futures for result value
benclifford Sep 29, 2025
507f778
remove handle app update
benclifford Oct 17, 2025
f58f27b
make memoizer into an interface class and impls
benclifford Jul 19, 2024
bc67fd8
this framework is a bit over-generic and doesn't catch type errors.
benclifford Sep 29, 2025
c2c4cf4
configurable memoizer instance
benclifford Jul 19, 2024
83b8fb6
checkpoint exceptions
benclifford Jul 30, 2024
c75308e
make_hash does not need to be part of basic memoizer
benclifford Aug 22, 2024
21f0c49
rework exception tests to be BasicMemoizer tests
benclifford Oct 7, 2025
a9cb522
out-of-memory checkpointing
benclifford Aug 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions parsl/benchmark/perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def performance(*, resources: dict, target_t: float, args_extra_size: int, itera

iteration = 1

args_extra_payload = "x" * args_extra_size
# args_extra_payload = "x" * args_extra_size

iterate = True

Expand All @@ -59,7 +59,13 @@ def performance(*, resources: dict, target_t: float, args_extra_size: int, itera

fs = []
print("Submitting tasks / invoking apps")
for _ in range(n):
for index in range(n):
# this means there is a different argument for each iteration,
# which will make checkpointing/memo behave differently
# so this could be switchable in parsl-perf dev branch
# args_extra_payload = index # always a new one (except for run repeats)

args_extra_payload = index % 10
fs.append(app(args_extra_payload, parsl_resource_specification=resources))

submitted_t = time.time()
Expand Down
37 changes: 3 additions & 34 deletions parsl/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing_extensions import Literal

from parsl.dataflow.dependency_resolvers import DependencyResolver
from parsl.dataflow.memoization import Memoizer
from parsl.dataflow.taskrecord import TaskRecord
from parsl.errors import ConfigurationError
from parsl.executors.base import ParslExecutor
Expand All @@ -27,17 +28,6 @@ class Config(RepresentationMixin, UsageInformation):
executors : sequence of ParslExecutor, optional
List (or other iterable) of `ParslExecutor` instances to use for executing tasks.
Default is (:class:`~parsl.executors.threads.ThreadPoolExecutor()`,).
app_cache : bool, optional
Enable app caching. Default is True.
checkpoint_files : sequence of str, optional
List of paths to checkpoint files. See :func:`parsl.utils.get_all_checkpoints` and
:func:`parsl.utils.get_last_checkpoint` for helpers. Default is None.
checkpoint_mode : str, optional
Checkpoint mode to use, can be ``'dfk_exit'``, ``'task_exit'``, ``'periodic'`` or ``'manual'``.
If set to `None`, checkpointing will be disabled. Default is None.
checkpoint_period : str, optional
Time interval (in "HH:MM:SS") at which to checkpoint completed tasks. Only has an effect if
``checkpoint_mode='periodic'``.
dependency_resolver: plugin point for custom dependency resolvers. Default: only resolve Futures,
using the `SHALLOW_DEPENDENCY_RESOLVER`.
exit_mode: str, optional
Expand Down Expand Up @@ -100,14 +90,7 @@ class Config(RepresentationMixin, UsageInformation):
@typeguard.typechecked
def __init__(self,
executors: Optional[Iterable[ParslExecutor]] = None,
app_cache: bool = True,
checkpoint_files: Optional[Sequence[str]] = None,
checkpoint_mode: Union[None,
Literal['task_exit'],
Literal['periodic'],
Literal['dfk_exit'],
Literal['manual']] = None,
checkpoint_period: Optional[str] = None,
memoizer: Optional[Memoizer] = None,
dependency_resolver: Optional[DependencyResolver] = None,
exit_mode: Literal['cleanup', 'skip', 'wait'] = 'cleanup',
garbage_collect: bool = True,
Expand All @@ -131,21 +114,7 @@ def __init__(self,
self._executors: Sequence[ParslExecutor] = executors
self._validate_executors()

self.app_cache = app_cache
self.checkpoint_files = checkpoint_files
self.checkpoint_mode = checkpoint_mode
if checkpoint_period is not None:
if checkpoint_mode is None:
logger.debug('The requested `checkpoint_period={}` will have no effect because `checkpoint_mode=None`'.format(
checkpoint_period)
)
elif checkpoint_mode != 'periodic':
logger.debug("Requested checkpoint period of {} only has an effect with checkpoint_mode='periodic'".format(
checkpoint_period)
)
if checkpoint_mode == 'periodic' and checkpoint_period is None:
checkpoint_period = "00:30:00"
self.checkpoint_period = checkpoint_period
self.memoizer = memoizer
self.dependency_resolver = dependency_resolver
self.exit_mode = exit_mode
self.garbage_collect = garbage_collect
Expand Down
4 changes: 2 additions & 2 deletions parsl/configs/ASPIRE1.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from parsl.addresses import address_by_interface
from parsl.config import Config
from parsl.dataflow.memoization import BasicMemoizer
from parsl.executors import HighThroughputExecutor
from parsl.launchers import MpiRunLauncher
from parsl.monitoring.monitoring import MonitoringHub
Expand Down Expand Up @@ -38,7 +39,6 @@
),
strategy='simple',
retries=3,
app_cache=True,
checkpoint_mode='task_exit',
memoizer=BasicMemoizer(checkpoint_mode='task_exit'),
usage_tracking=LEVEL_1,
)
94 changes: 36 additions & 58 deletions parsl/dataflow/dflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from parsl.dataflow.dependency_resolvers import SHALLOW_DEPENDENCY_RESOLVER
from parsl.dataflow.errors import DependencyError, JoinError
from parsl.dataflow.futures import AppFuture
from parsl.dataflow.memoization import Memoizer
from parsl.dataflow.memoization import BasicMemoizer, Memoizer
from parsl.dataflow.rundirs import make_rundir
from parsl.dataflow.states import FINAL_FAILURE_STATES, FINAL_STATES, States
from parsl.dataflow.taskrecord import TaskRecord
Expand Down Expand Up @@ -165,12 +165,8 @@ def __init__(self, config: Config) -> None:
self.monitoring_radio.send((MessageType.WORKFLOW_INFO,
workflow_info))

self.memoizer = Memoizer(memoize=config.app_cache,
checkpoint_mode=config.checkpoint_mode,
checkpoint_files=config.checkpoint_files,
checkpoint_period=config.checkpoint_period)
self.memoizer.run_dir = self.run_dir
self.memoizer.start()
self.memoizer: Memoizer = config.memoizer if config.memoizer is not None else BasicMemoizer()
self.memoizer.start(run_dir=self.run_dir)

# this must be set before executors are added since add_executors calls
# job_status_poller.add_executors.
Expand Down Expand Up @@ -352,14 +348,8 @@ def handle_exec_update(self, task_record: TaskRecord, future: Future) -> None:
task_record['fail_cost'] += 1

if isinstance(e, DependencyError):
# was this sending two task log infos? if so would I see the row twice in the monitoring db?
self.update_task_state(task_record, States.dep_fail)
logger.info("Task {} failed due to dependency failure so skipping retries".format(task_id))
task_record['time_returned'] = datetime.datetime.now()
self._send_task_log_info(task_record)
self.memoizer.update_memo(task_record)
with task_record['app_fu']._update_lock:
task_record['app_fu'].set_exception(e)
self._complete_task_exception(task_record, States.dep_fail, e)

elif task_record['fail_cost'] <= self._config.retries:

Expand All @@ -379,12 +369,7 @@ def handle_exec_update(self, task_record: TaskRecord, future: Future) -> None:
else:
logger.exception("Task {} failed after {} retry attempts".format(task_id,
task_record['try_id']))
self.update_task_state(task_record, States.failed)
task_record['time_returned'] = datetime.datetime.now()
self._send_task_log_info(task_record)
self.memoizer.update_memo(task_record)
with task_record['app_fu']._update_lock:
task_record['app_fu'].set_exception(e)
self._complete_task_exception(task_record, States.failed, e)

else:
if task_record['from_memo']:
Expand Down Expand Up @@ -422,13 +407,10 @@ def handle_exec_update(self, task_record: TaskRecord, future: Future) -> None:
for inner_future in joinable:
inner_future.add_done_callback(partial(self.handle_join_update, task_record))
else:
self.update_task_state(task_record, States.failed)
task_record['time_returned'] = datetime.datetime.now()
self._send_task_log_info(task_record)
self.memoizer.update_memo(task_record)
with task_record['app_fu']._update_lock:
task_record['app_fu'].set_exception(
TypeError(f"join_app body must return a Future or list of Futures, got {joinable} of type {type(joinable)}"))
self._complete_task_exception(
task_record,
States.failed,
TypeError(f"join_app body must return a Future or list of Futures, got {joinable} of type {type(joinable)}"))

self._log_std_streams(task_record)

Expand Down Expand Up @@ -499,12 +481,7 @@ def handle_join_update(self, task_record: TaskRecord, inner_app_future: Optional
# no need to update the fail cost because join apps are never
# retried

self.update_task_state(task_record, States.failed)
task_record['time_returned'] = datetime.datetime.now()
self.memoizer.update_memo(task_record)
with task_record['app_fu']._update_lock:
task_record['app_fu'].set_exception(e)
self._send_task_log_info(task_record)
self._complete_task_exception(task_record, States.failed, e)

else:
# all the joinables succeeded, so construct a result:
Expand All @@ -521,49 +498,47 @@ def handle_join_update(self, task_record: TaskRecord, inner_app_future: Optional

self._log_std_streams(task_record)

def handle_app_update(self, task_record: TaskRecord, future: AppFuture) -> None:
"""This function is called as a callback when an AppFuture
is in its final state.

It will trigger post-app processing such as checkpointing.
def _complete_task_result(self, task_record: TaskRecord, new_state: States, result: Any) -> None:
"""Set a task into a completed state
"""
assert new_state in FINAL_STATES
assert new_state not in FINAL_FAILURE_STATES
old_state = task_record['status']

Args:
task_record : Task record
future (Future) : The relevant app future (which should be
consistent with the task structure 'app_fu' entry
self.update_task_state(task_record, new_state)

"""
logger.info(f"Task {task_record['id']} completed ({old_state.name} -> {new_state.name})")
task_record['time_returned'] = datetime.datetime.now()

task_id = task_record['id']
self.memoizer.update_memo_result(task_record, result)

if not task_record['app_fu'].done():
logger.error("Internal consistency error: app_fu is not done for task {}".format(task_id))
if not task_record['app_fu'] == future:
logger.error("Internal consistency error: callback future is not the app_fu in task structure, for task {}".format(task_id))
self._send_task_log_info(task_record)

self.memoizer.update_checkpoint(task_record)
self.wipe_task(task_record['id'])

self.wipe_task(task_id)
return
with task_record['app_fu']._update_lock:
task_record['app_fu'].set_result(result)

def _complete_task_result(self, task_record: TaskRecord, new_state: States, result: Any) -> None:
"""Set a task into a completed state
def _complete_task_exception(self, task_record: TaskRecord, new_state: States, exception: BaseException) -> None:
"""Set a task into a failure state
"""
assert new_state in FINAL_STATES
assert new_state not in FINAL_FAILURE_STATES
assert new_state in FINAL_FAILURE_STATES
old_state = task_record['status']

self.update_task_state(task_record, new_state)

logger.info(f"Task {task_record['id']} completed ({old_state.name} -> {new_state.name})")
logger.info(f"Task {task_record['id']} failed ({old_state.name} -> {new_state.name})")
task_record['time_returned'] = datetime.datetime.now()

self.memoizer.update_memo(task_record)
self.memoizer.update_memo_exception(task_record, exception)

self._send_task_log_info(task_record)

self.wipe_task(task_record['id'])

with task_record['app_fu']._update_lock:
task_record['app_fu'].set_result(result)
task_record['app_fu'].set_exception(exception)

def update_task_state(self, task_record: TaskRecord, new_state: States) -> None:
"""Updates a task record state, and recording an appropriate change
Expand Down Expand Up @@ -1053,7 +1028,6 @@ def submit(self,
task_record['func_name'],
waiting_message))

app_fu.add_done_callback(partial(self.handle_app_update, task_record))
self.update_task_state(task_record, States.pending)
logger.debug("Task {} set to pending state with AppFuture: {}".format(task_id, task_record['app_fu']))

Expand Down Expand Up @@ -1227,6 +1201,10 @@ def cleanup(self) -> None:
# should still see it.
logger.info("DFK cleanup complete")

# TODO: this should maybe go away: manual explicit checkpointing is
# a property of the (upcoming) BasicMemoizer, not of a memoisation
# plugin in general -- configure a BasicMemoizer separately from the
# DFK and call checkpoint on that...
def checkpoint(self) -> None:
self.memoizer.checkpoint()

Expand Down
Loading