|
6 | 6 | import inspect |
7 | 7 | import logging |
8 | 8 | import os |
9 | | -import pickle |
10 | 9 | import random |
11 | 10 | import sys |
12 | 11 | import threading |
@@ -101,8 +100,6 @@ def __init__(self, config: Config) -> None: |
101 | 100 |
|
102 | 101 | logger.info("Parsl version: {}".format(get_version())) |
103 | 102 |
|
104 | | - self.checkpoint_lock = threading.Lock() |
105 | | - |
106 | 103 | self.usage_tracker = UsageTracker(self) |
107 | 104 | self.usage_tracker.send_start_message() |
108 | 105 |
|
@@ -176,7 +173,8 @@ def __init__(self, config: Config) -> None: |
176 | 173 | checkpoint_files = [] |
177 | 174 |
|
178 | 175 | self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files) |
179 | | - self.checkpointed_tasks = 0 |
| 176 | + self.memoizer.run_dir = self.run_dir |
| 177 | + |
180 | 178 | self._checkpoint_timer = None |
181 | 179 | self.checkpoint_mode = config.checkpoint_mode |
182 | 180 |
|
@@ -569,7 +567,7 @@ def handle_app_update(self, task_record: TaskRecord, future: AppFuture) -> None: |
569 | 567 | # Do we need to checkpoint now, or queue for later, |
570 | 568 | # or do nothing? |
571 | 569 | if self.checkpoint_mode == 'task_exit': |
572 | | - self.checkpoint(tasks=[task_record]) |
| 570 | + self.memoizer.checkpoint(tasks=[task_record]) |
573 | 571 | elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'): |
574 | 572 | with self._modify_checkpointable_tasks_lock: |
575 | 573 | self.checkpointable_tasks.append(task_record) |
@@ -1210,7 +1208,7 @@ def cleanup(self) -> None: |
1210 | 1208 |
|
1211 | 1209 | # TODO: accesses to self.checkpointable_tasks should happen |
1212 | 1210 | # under a lock? |
1213 | | - self.checkpoint(self.checkpointable_tasks) |
| 1211 | + self.memoizer.checkpoint(self.checkpointable_tasks) |
1214 | 1212 |
|
1215 | 1213 | if self._checkpoint_timer: |
1216 | 1214 | logger.info("Stopping checkpoint timer") |
@@ -1274,66 +1272,9 @@ def cleanup(self) -> None: |
1274 | 1272 |
|
1275 | 1273 | def invoke_checkpoint(self) -> None: |
1276 | 1274 | with self._modify_checkpointable_tasks_lock: |
1277 | | - self.checkpoint(self.checkpointable_tasks) |
| 1275 | + self.memoizer.checkpoint(self.checkpointable_tasks) |
1278 | 1276 | self.checkpointable_tasks = [] |
1279 | 1277 |
|
1280 | | - def checkpoint(self, tasks: Sequence[TaskRecord]) -> None: |
1281 | | - """Checkpoint the dfk incrementally to a checkpoint file. |
1282 | | -
|
1283 | | - When called, every task that has been completed yet not |
1284 | | - checkpointed is checkpointed to a file. |
1285 | | -
|
1286 | | - Kwargs: |
1287 | | - - tasks (List of task records) : List of task ids to checkpoint. Default=None |
1288 | | - if set to None, we iterate over all tasks held by the DFK. |
1289 | | -
|
1290 | | - .. note:: |
1291 | | - Checkpointing only works if memoization is enabled |
1292 | | -
|
1293 | | - Returns: |
1294 | | - Checkpoint dir if checkpoints were written successfully. |
1295 | | - By default the checkpoints are written to the RUNDIR of the current |
1296 | | - run under RUNDIR/checkpoints/tasks.pkl |
1297 | | - """ |
1298 | | - with self.checkpoint_lock: |
1299 | | - checkpoint_queue = tasks |
1300 | | - |
1301 | | - checkpoint_dir = '{0}/checkpoint'.format(self.run_dir) |
1302 | | - checkpoint_tasks = checkpoint_dir + '/tasks.pkl' |
1303 | | - |
1304 | | - if not os.path.exists(checkpoint_dir): |
1305 | | - os.makedirs(checkpoint_dir, exist_ok=True) |
1306 | | - |
1307 | | - count = 0 |
1308 | | - |
1309 | | - with open(checkpoint_tasks, 'ab') as f: |
1310 | | - for task_record in checkpoint_queue: |
1311 | | - task_id = task_record['id'] |
1312 | | - |
1313 | | - app_fu = task_record['app_fu'] |
1314 | | - |
1315 | | - if app_fu.done() and app_fu.exception() is None: |
1316 | | - hashsum = task_record['hashsum'] |
1317 | | - if not hashsum: |
1318 | | - continue |
1319 | | - t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()} |
1320 | | - |
1321 | | - # We are using pickle here since pickle dumps to a file in 'ab' |
1322 | | - # mode behaves like an incremental log. |
1323 | | - pickle.dump(t, f) |
1324 | | - count += 1 |
1325 | | - logger.debug("Task {} checkpointed".format(task_id)) |
1326 | | - |
1327 | | - self.checkpointed_tasks += count |
1328 | | - |
1329 | | - if count == 0: |
1330 | | - if self.checkpointed_tasks == 0: |
1331 | | - logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled") |
1332 | | - else: |
1333 | | - logger.debug("No tasks checkpointed in this pass.") |
1334 | | - else: |
1335 | | - logger.info("Done checkpointing {} tasks".format(count)) |
1336 | | - |
1337 | 1278 | @staticmethod |
1338 | 1279 | def _log_std_streams(task_record: TaskRecord) -> None: |
1339 | 1280 | tid = task_record['id'] |
|
0 commit comments