@@ -7,7 +7,6 @@
 import logging
 import os
 import pathlib
-import pickle
 import random
 import sys
 import threading
@@ -99,8 +98,6 @@ def __init__(self, config: Config) -> None:
 
         logger.info("Parsl version: {}".format(get_version()))
 
-        self.checkpoint_lock = threading.Lock()
-
         self.usage_tracker = UsageTracker(self)
         self.usage_tracker.send_start_message()
 
@@ -177,7 +174,8 @@ def __init__(self, config: Config) -> None:
         checkpoint_files = []
 
         self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
-        self.checkpointed_tasks = 0
+        self.memoizer.run_dir = self.run_dir
+
         self._checkpoint_timer = None
         self.checkpoint_mode = config.checkpoint_mode
 
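Note: the receiving side of this refactor lives in the Memoizer and is not shown in this diff. As a rough sketch only, assuming the logic deleted from the DataFlowKernel below moves across largely intact, the Memoizer would grow its own lock, a run_dir attribute, and a checkpoint() method along these lines (names and structure here are illustrative, not confirmed by this diff):

    import logging
    import os
    import pickle
    import threading
    from typing import Sequence

    logger = logging.getLogger(__name__)


    class Memoizer:
        # Matches the call site above: Memoizer(dfk, memoize=..., checkpoint_files=...)
        def __init__(self, dfk, *, memoize=True, checkpoint_files=None):
            self.run_dir = None  # assigned by the DFK right after construction
            self._checkpoint_lock = threading.Lock()  # replaces dfk.checkpoint_lock

        def checkpoint(self, tasks: Sequence) -> str:
            """Append completed, hashed tasks to run_dir/checkpoint/tasks.pkl."""
            with self._checkpoint_lock:
                checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
                os.makedirs(checkpoint_dir, exist_ok=True)
                count = 0
                with open(checkpoint_dir + '/tasks.pkl', 'ab') as f:
                    for task_record in tasks:
                        app_fu = task_record['app_fu']
                        if app_fu.done() and app_fu.exception() is None and task_record['hashsum']:
                            pickle.dump({'hash': task_record['hashsum'],
                                         'exception': None,
                                         'result': app_fu.result()}, f)
                            count += 1
                logger.info("Done checkpointing {} tasks".format(count))
                return checkpoint_dir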
@@ -575,7 +573,7 @@ def handle_app_update(self, task_record: TaskRecord, future: AppFuture) -> None:
         # Do we need to checkpoint now, or queue for later,
         # or do nothing?
         if self.checkpoint_mode == 'task_exit':
-            self.checkpoint(tasks=[task_record])
+            self.memoizer.checkpoint(tasks=[task_record])
         elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
             with self._modify_checkpointable_tasks_lock:
                 self.checkpointable_tasks.append(task_record)
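The checkpoint_mode strings dispatched on here come directly from the user-facing parsl Config. For context, a minimal configuration that exercises the 'task_exit' branch above, using Parsl's default thread executor:

    import parsl
    from parsl.config import Config

    # Every successfully completed task is checkpointed as it finishes.
    # The other modes are 'manual', 'periodic' (with a checkpoint_period)
    # and 'dfk_exit'.
    parsl.load(Config(checkpoint_mode='task_exit'))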
@@ -1259,7 +1257,7 @@ def cleanup(self) -> None:
 
         # TODO: accesses to self.checkpointable_tasks should happen
         # under a lock?
-        self.checkpoint(self.checkpointable_tasks)
+        self.memoizer.checkpoint(self.checkpointable_tasks)
 
         if self._checkpoint_timer:
             logger.info("Stopping checkpoint timer")
@@ -1334,76 +1332,10 @@ def cleanup(self) -> None:
 
     def invoke_checkpoint(self):
         with self._modify_checkpointable_tasks_lock:
-            r = self.checkpoint(self.checkpointable_tasks)
+            r = self.memoizer.checkpoint(self.checkpointable_tasks)
             self.checkpointable_tasks = []
         return r
 
-    def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
-        """Checkpoint the dfk incrementally to a checkpoint file.
-
-        When called, every task that has been completed yet not
-        checkpointed is checkpointed to a file.
-
-        Kwargs:
-            - tasks (List of task records) : List of task ids to checkpoint. Default=None
-              if set to None, we iterate over all tasks held by the DFK.
-
-        .. note::
-            Checkpointing only works if memoization is enabled
-
-        Returns:
-            Checkpoint dir if checkpoints were written successfully.
-            By default the checkpoints are written to the RUNDIR of the current
-            run under RUNDIR/checkpoints/{tasks.pkl, dfk.pkl}
-        """
-        with self.checkpoint_lock:
-            checkpoint_queue = tasks
-
-            checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
-            checkpoint_dfk = checkpoint_dir + '/dfk.pkl'
-            checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
-
-            if not os.path.exists(checkpoint_dir):
-                os.makedirs(checkpoint_dir, exist_ok=True)
-
-            with open(checkpoint_dfk, 'wb') as f:
-                state = {'rundir': self.run_dir,
-                         'task_count': self.task_count
-                         }
-                pickle.dump(state, f)
-
-            count = 0
-
-            with open(checkpoint_tasks, 'ab') as f:
-                for task_record in checkpoint_queue:
-                    task_id = task_record['id']
-
-                    app_fu = task_record['app_fu']
-
-                    if app_fu.done() and app_fu.exception() is None:
-                        hashsum = task_record['hashsum']
-                        if not hashsum:
-                            continue
-                        t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
-
-                        # We are using pickle here since pickle dumps to a file in 'ab'
-                        # mode behave like a incremental log.
-                        pickle.dump(t, f)
-                        count += 1
-                        logger.debug("Task {} checkpointed".format(task_id))
-
-            self.checkpointed_tasks += count
-
-            if count == 0:
-                if self.checkpointed_tasks == 0:
-                    logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
-                else:
-                    logger.debug("No tasks checkpointed in this pass.")
-            else:
-                logger.info("Done checkpointing {} tasks".format(count))
-
-            return checkpoint_dir
-
     @staticmethod
     def _log_std_streams(task_record: TaskRecord) -> None:
         tid = task_record['id']
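The deleted comment above notes that pickle dumps appended in 'ab' mode behave like an incremental log. Concretely, such a tasks.pkl can be replayed by calling pickle.load() repeatedly until EOF, one record per dump. A standalone illustration (the path is an example, not fixed by this diff):

    import pickle

    def load_checkpoint_records(path):
        """Return the list of checkpointed {'hash', 'exception', 'result'} dicts."""
        records = []
        with open(path, 'rb') as f:
            while True:
                try:
                    records.append(pickle.load(f))
                except EOFError:
                    break  # reached the end of the incremental log
        return records

    # e.g. records = load_checkpoint_records('runinfo/000/checkpoint/tasks.pkl')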