@@ -7,7 +7,6 @@
 import logging
 import os
 import pathlib
-import pickle
 import random
 import sys
 import threading
@@ -99,8 +98,6 @@ def __init__(self, config: Config) -> None:
 
         logger.info("Parsl version: {}".format(get_version()))
 
-        self.checkpoint_lock = threading.Lock()
-
         self.usage_tracker = UsageTracker(self)
         self.usage_tracker.send_start_message()
 
@@ -173,7 +170,8 @@ def __init__(self, config: Config) -> None:
         checkpoint_files = []
 
         self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
-        self.checkpointed_tasks = 0
+        self.memoizer.run_dir = self.run_dir
+
         self._checkpoint_timer = None
         self.checkpoint_mode = config.checkpoint_mode
 
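The hunk above wires the run directory into the memoizer by attribute assignment after construction, rather than through the constructor. A hypothetical two-line illustration of the resulting ordering constraint: the assignment must happen before any checkpoint call, since the checkpoint directory is derived from run_dir (names taken from this diff; object construction elided):

memoizer = Memoizer(dfk, memoize=True, checkpoint_files=[])
memoizer.run_dir = dfk.run_dir   # must precede any memoizer.checkpoint(...) call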
@@ -571,7 +569,7 @@ def handle_app_update(self, task_record: TaskRecord, future: AppFuture) -> None:
         # Do we need to checkpoint now, or queue for later,
         # or do nothing?
         if self.checkpoint_mode == 'task_exit':
-            self.checkpoint(tasks=[task_record])
+            self.memoizer.checkpoint(tasks=[task_record])
         elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
             with self._modify_checkpointable_tasks_lock:
                 self.checkpointable_tasks.append(task_record)
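In the task_exit path above, each app future is now checkpointed by the memoizer as soon as it completes; the other modes queue the task record under _modify_checkpointable_tasks_lock for a later batched checkpoint. As a usage sketch, the corresponding user-facing configuration might look like the following (executor setup omitted; app_cache and checkpoint_mode are existing Config parameters, unchanged by this diff):

from parsl.config import Config

config = Config(
    app_cache=True,              # checkpointing requires memoization
    checkpoint_mode='task_exit'  # checkpoint each task as its future completes
)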
@@ -1255,7 +1253,7 @@ def cleanup(self) -> None:
 
         # TODO: accesses to self.checkpointable_tasks should happen
         # under a lock?
-        self.checkpoint(self.checkpointable_tasks)
+        self.memoizer.checkpoint(self.checkpointable_tasks)
 
         if self._checkpoint_timer:
             logger.info("Stopping checkpoint timer")
@@ -1330,76 +1328,10 @@ def cleanup(self) -> None:
 
     def invoke_checkpoint(self):
         with self._modify_checkpointable_tasks_lock:
-            r = self.checkpoint(self.checkpointable_tasks)
+            r = self.memoizer.checkpoint(self.checkpointable_tasks)
             self.checkpointable_tasks = []
         return r
 
-    def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
-        """Checkpoint the dfk incrementally to a checkpoint file.
-
-        When called, every task that has been completed yet not
-        checkpointed is checkpointed to a file.
-
-        Kwargs:
-            - tasks (List of task records) : List of task ids to checkpoint. Default=None
-              if set to None, we iterate over all tasks held by the DFK.
-
-        .. note::
-            Checkpointing only works if memoization is enabled
-
-        Returns:
-            Checkpoint dir if checkpoints were written successfully.
-            By default the checkpoints are written to the RUNDIR of the current
-            run under RUNDIR/checkpoints/{tasks.pkl, dfk.pkl}
-        """
-        with self.checkpoint_lock:
-            checkpoint_queue = tasks
-
-            checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
-            checkpoint_dfk = checkpoint_dir + '/dfk.pkl'
-            checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
-
-            if not os.path.exists(checkpoint_dir):
-                os.makedirs(checkpoint_dir, exist_ok=True)
-
-            with open(checkpoint_dfk, 'wb') as f:
-                state = {'rundir': self.run_dir,
-                         'task_count': self.task_count
-                         }
-                pickle.dump(state, f)
-
-            count = 0
-
-            with open(checkpoint_tasks, 'ab') as f:
-                for task_record in checkpoint_queue:
-                    task_id = task_record['id']
-
-                    app_fu = task_record['app_fu']
-
-                    if app_fu.done() and app_fu.exception() is None:
-                        hashsum = task_record['hashsum']
-                        if not hashsum:
-                            continue
-                        t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
-
-                        # We are using pickle here since pickle dumps to a file in 'ab'
-                        # mode behave like a incremental log.
-                        pickle.dump(t, f)
-                        count += 1
-                        logger.debug("Task {} checkpointed".format(task_id))
-
-            self.checkpointed_tasks += count
-
-            if count == 0:
-                if self.checkpointed_tasks == 0:
-                    logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
-                else:
-                    logger.debug("No tasks checkpointed in this pass.")
-            else:
-                logger.info("Done checkpointing {} tasks".format(count))
-
-        return checkpoint_dir
-
     @staticmethod
     def _log_std_streams(task_record: TaskRecord) -> None:
         tid = task_record['id']
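The 66 deleted lines above are the body of DataFlowKernel.checkpoint, and the rewritten call sites earlier in this diff (self.memoizer.checkpoint(...)) indicate the method now lives on the Memoizer. Below is a minimal sketch of what the relocated method plausibly looks like, reconstructed from the deleted code; the constructor shape matches the call site in __init__, while the checkpoint_lock attribute, the run_dir default, and reading task_count off the owning DFK are assumptions rather than Parsl's actual implementation:

import logging
import os
import pickle
import threading
from typing import Sequence

from parsl.dataflow.taskrecord import TaskRecord

logger = logging.getLogger(__name__)


class Memoizer:
    def __init__(self, dfk, *, memoize: bool = True, checkpoint_files=None) -> None:
        self.dfk = dfk
        self.memoize = memoize
        self.checkpoint_files = checkpoint_files or []
        self.checkpointed_tasks = 0              # counter moved here from the DFK
        self.checkpoint_lock = threading.Lock()  # replaces the deleted DFK lock (assumed)
        self.run_dir = "."                       # assigned by the DFK after construction

    def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
        """Incrementally checkpoint completed, error-free tasks to
        run_dir/checkpoint/{dfk.pkl, tasks.pkl}, as the deleted DFK method did."""
        with self.checkpoint_lock:
            checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
            os.makedirs(checkpoint_dir, exist_ok=True)

            with open(checkpoint_dir + '/dfk.pkl', 'wb') as f:
                # task_count is read off the owning DFK here: an assumption.
                pickle.dump({'rundir': self.run_dir,
                             'task_count': self.dfk.task_count}, f)

            count = 0
            # 'ab' mode makes successive pickle.dump calls behave like an
            # incremental log, as the deleted comment noted.
            with open(checkpoint_dir + '/tasks.pkl', 'ab') as f:
                for task_record in tasks:
                    app_fu = task_record['app_fu']
                    if app_fu.done() and app_fu.exception() is None:
                        hashsum = task_record['hashsum']
                        if not hashsum:
                            continue
                        pickle.dump({'hash': hashsum,
                                     'exception': None,
                                     'result': app_fu.result()}, f)
                        count += 1
                        logger.debug("Task {} checkpointed".format(task_record['id']))

            self.checkpointed_tasks += count

            if count == 0:
                if self.checkpointed_tasks == 0:
                    logger.warning("No tasks checkpointed so far in this run. "
                                   "Please ensure caching is enabled")
                else:
                    logger.debug("No tasks checkpointed in this pass.")
            else:
                logger.info("Done checkpointing {} tasks".format(count))

        return checkpoint_dir

Keeping the lock inside the Memoizer preserves the original thread-safety guarantee, since both the task_exit path and cleanup() can now invoke checkpoint from different threads.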
|