Skip to content

Commit 918a5d7

Browse files
pfackeldey, ikrommyd, and lgray
authored
feat: add checkpointing functionality to coffea.processor.Runner (#1420)
* add checkpointing functionality to coffea.processor.Runner * fix annotation * add test * add check to 'save' step * improve the LocalCheckpointer logic a bit * improve test to not rerun preprocessing every time * use fsspec for load/save * 'LocalCheckpointer' -> 'SimpleCheckpointer' * switch to fsspec path checking * switch to rich print * test with fsspec to be able to do offline tests on remote storages * do not re-open open paths * we don't need the token --------- Co-authored-by: Iason Krommydas <iason.krom@gmail.com> Co-authored-by: Lindsey Gray <lindsey.gray@gmail.com>
1 parent 97dfa70 commit 918a5d7

File tree

5 files changed

+251
-18
lines changed

5 files changed

+251
-18
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ dependencies = [
6262
"cachetools",
6363
"requests",
6464
"aiohttp",
65+
"fsspec",
6566
]
6667
dynamic = ["version"]
6768

src/coffea/processor/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
set_accumulator,
1212
value_accumulator,
1313
)
14+
from .checkpointer import CheckpointerABC, SimpleCheckpointer
1415
from .executor import (
1516
DaskExecutor,
1617
FuturesExecutor,
@@ -29,6 +30,8 @@
2930
"ParslExecutor",
3031
"TaskVineExecutor",
3132
"Runner",
33+
"CheckpointerABC",
34+
"SimpleCheckpointer",
3235
"accumulate",
3336
"Accumulatable",
3437
"AccumulatorABC",
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
from __future__ import annotations
2+
3+
from abc import ABCMeta, abstractmethod
4+
from pathlib import Path
5+
from typing import TYPE_CHECKING, Any
6+
7+
import cloudpickle
8+
import fsspec
9+
from rich import print
10+
11+
if TYPE_CHECKING:
12+
from coffea.processor import Accumulatable, ProcessorABC
13+
14+
15+
class CheckpointerABC(metaclass=ABCMeta):
    """ABC for a generalized checkpointer

    Checkpointers are used to save chunk outputs to disk, and reload them if the same chunk is processed again.
    This is useful for long-running jobs that may be interrupted (resumable processing).

    Examples
    --------

    >>> from datetime import datetime
    >>> from coffea import processor
    >>> from coffea.processor import SimpleCheckpointer

    # create a checkpointer that stores checkpoints in a directory with the current date/time
    # (you may want to use a more specific directory in practice)
    >>> datestring = datetime.now().strftime("%Y%m%d%H")
    >>> checkpointer = SimpleCheckpointer(checkpoint_dir=f"checkpoints/{datestring}", verbose=True)

    # pass the checkpointer to a Runner
    >>> run = processor.Runner(..., checkpointer=checkpointer)
    >>> output = run(...)

    After the run, the checkpoints will be stored in the directory ``checkpoints/{datestring}``. On a subsequent run,
    if the same chunks are processed (and the same checkpointer, or rather ``checkpoint_dir`` is used),
    the results will be loaded from disk instead of being recomputed.
    """

    @abstractmethod
    def load(
        self, metadata: Any, processor_instance: ProcessorABC
    ) -> Accumulatable | None:
        """Return the previously saved output for the chunk described by
        ``metadata``, or ``None`` if no checkpoint is available."""
        ...

    @abstractmethod
    def save(
        self, output: Accumulatable, metadata: Any, processor_instance: ProcessorABC
    ) -> None:
        """Persist ``output`` for the chunk described by ``metadata`` so a
        later :meth:`load` with the same metadata can return it."""
        ...
51+
52+
53+
class SimpleCheckpointer(CheckpointerABC):
    """Checkpointer that stores one lz4-compressed pickle per chunk under ``checkpoint_dir``.

    Checkpoint files are addressed by dataset / file uuid / tree name / entry range,
    so a chunk reprocessed with identical metadata maps to the same path and is
    reloaded instead of recomputed. Both loading and saving are best effort: any
    failure is reported (when ``verbose``) and the run continues.

    Parameters
    ----------
    checkpoint_dir : str
        Any fsspec-compatible URL (local path, ``root://...``, ``s3://...``, ...)
        under which checkpoints are stored.
    verbose : bool, optional
        If True, print diagnostics on cache misses and load/save failures.
    overwrite : bool, optional
        If True (default), saving replaces an existing checkpoint file.
    """

    def __init__(
        self,
        checkpoint_dir: str,
        verbose: bool = False,
        overwrite: bool = True,
    ) -> None:
        # Split the URL into the filesystem implementation and the path within it.
        fs, path = fsspec.url_to_fs(checkpoint_dir)
        self.fs = fs
        self.checkpoint_dir = path
        self.verbose = verbose
        self.overwrite = overwrite

    def filepath(self, metadata: Any, processor_instance: ProcessorABC) -> str:
        """Return the checkpoint path for the chunk described by ``metadata``."""
        del processor_instance  # not used here, but could be in subclasses

        # Join with "/" explicitly: fsspec paths are posix-style on every
        # platform, whereas pathlib.Path would produce backslashes on Windows
        # and corrupt remote paths.
        return "/".join(
            [
                self.checkpoint_dir,
                metadata["dataset"],
                metadata["fileuuid"],
                metadata["treename"],
                f"{metadata['entrystart']}-{metadata['entrystop']}.coffea",
            ]
        )

    def load(
        self, metadata: Any, processor_instance: ProcessorABC
    ) -> Accumulatable | None:
        """Load a previously saved chunk output, or return ``None`` on a miss.

        A corrupt or unreadable checkpoint is treated as a miss so the chunk
        is simply recomputed.
        """
        fs = self.fs
        fpath = self.filepath(metadata, processor_instance)
        if not fs.exists(fpath):
            if self.verbose:
                print(
                    f"Checkpoint file {fpath} does not exist. May be the first run..."
                )
            return None
        try:
            with fs.open(fpath, "rb", compression="lz4") as fin:
                return cloudpickle.load(fin)
        except Exception as e:
            # Never let a bad checkpoint abort the run; fall back to recomputing.
            if self.verbose:
                print(f"Could not load checkpoint: {e}.")
            return None

    def save(
        self, output: Accumulatable, metadata: Any, processor_instance: ProcessorABC
    ) -> None:
        """Save ``output`` to its checkpoint file (best effort, never raises)."""
        fs = self.fs
        fpath = self.filepath(metadata, processor_instance)
        # ensure the parent directory exists
        fs.mkdirs(fpath.rsplit("/", 1)[0], exist_ok=True)
        if fs.exists(fpath) and not self.overwrite:
            if self.verbose:
                print(f"Checkpoint file {fpath} already exists. Not overwriting...")
            return None
        try:
            with fs.open(fpath, "wb", compression="lz4") as fout:
                # Do not rebind ``output`` here: cloudpickle.dump returns None,
                # so the original ``output = cloudpickle.dump(...)`` silently
                # clobbered the accumulator reference.
                cloudpickle.dump(output, fout)
        except Exception as e:
            # Saving is best effort; a failure only costs reprocessing later.
            if self.verbose:
                print(
                    f"Could not save checkpoint: {e}. Continuing without checkpointing..."
                )
        return None

src/coffea/processor/executor.py

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from ..nanoevents import NanoEventsFactory, schemas
3030
from ..util import _exception_chain, _hash, deprecate, rich_bar
3131
from .accumulator import Accumulatable, accumulate, set_accumulator
32+
from .checkpointer import CheckpointerABC
3233
from .processor import ProcessorABC
3334

3435
_PICKLE_PROTOCOL = pickle.HIGHEST_PROTOCOL
@@ -1038,6 +1039,8 @@ class Runner:
10381039
determine chunking. Defaults to a in-memory LRU cache that holds 100k entries
10391040
(about 1MB depending on the length of filenames, etc.) If you edit an input file
10401041
(please don't) during a session, the session can be restarted to clear the cache.
1042+
checkpointer : CheckpointerABC, optional
1043+
A CheckpointerABC instance to manage checkpointing of each chunk output
10411044
"""
10421045

10431046
executor: ExecutorBase
@@ -1054,6 +1057,7 @@ class Runner:
10541057
use_skyhook: Optional[bool] = False
10551058
skyhook_options: Optional[dict] = field(default_factory=dict)
10561059
format: str = "root"
1060+
checkpointer: Optional[CheckpointerABC] = None
10571061

10581062
@staticmethod
10591063
def read_coffea_config():
@@ -1399,6 +1403,7 @@ def _work_function(
13991403
processor_instance: ProcessorABC,
14001404
uproot_options: dict,
14011405
iteritems_options: dict,
1406+
checkpointer: CheckpointerABC,
14021407
) -> dict:
14031408
if "timeout" in uproot_options:
14041409
xrootdtimeout = uproot_options["timeout"]
@@ -1407,6 +1412,28 @@ def _work_function(
14071412
if not isinstance(processor_instance, ProcessorABC):
14081413
processor_instance = cloudpickle.loads(lz4f.decompress(processor_instance))
14091414

1415+
metadata = {
1416+
"dataset": item.dataset,
1417+
"filename": item.filename,
1418+
"treename": item.treename,
1419+
"entrystart": item.entrystart,
1420+
"entrystop": item.entrystop,
1421+
"fileuuid": (
1422+
str(uuid.UUID(bytes=item.fileuuid)) if len(item.fileuuid) > 0 else ""
1423+
),
1424+
}
1425+
if item.usermeta is not None:
1426+
metadata.update(item.usermeta)
1427+
1428+
if checkpointer is not None:
1429+
if not isinstance(checkpointer, CheckpointerABC):
1430+
raise TypeError("Expected checkpointer to derive from CheckpointerABC")
1431+
# try to load from checkpoint
1432+
out = checkpointer.load(metadata, processor_instance)
1433+
# if we got something, return it
1434+
if out is not None:
1435+
return out
1436+
14101437
try:
14111438
if format == "root":
14121439
filecontext = uproot.open(
@@ -1421,19 +1448,6 @@ def _work_function(
14211448
f"Failed to open file: {item!r}. The error was: {e!r}."
14221449
) from e
14231450

1424-
metadata = {
1425-
"dataset": item.dataset,
1426-
"filename": item.filename,
1427-
"treename": item.treename,
1428-
"entrystart": item.entrystart,
1429-
"entrystop": item.entrystop,
1430-
"fileuuid": (
1431-
str(uuid.UUID(bytes=item.fileuuid)) if len(item.fileuuid) > 0 else ""
1432-
),
1433-
}
1434-
if item.usermeta is not None:
1435-
metadata.update(item.usermeta)
1436-
14371451
with filecontext as file:
14381452
if schema is None:
14391453
raise ValueError("Schema must be set")
@@ -1479,9 +1493,7 @@ def _work_function(
14791493
"Output of process() should not be None. Make sure your processor's process() function returns an accumulator."
14801494
)
14811495
toc = time.time()
1482-
if use_dataframes:
1483-
return out
1484-
else:
1496+
if not use_dataframes:
14851497
if savemetrics:
14861498
metrics = {}
14871499
if isinstance(file, uproot.ReadOnlyDirectory):
@@ -1490,8 +1502,17 @@ def _work_function(
14901502
metrics["columns"] = set(materialized)
14911503
metrics["entries"] = len(events)
14921504
metrics["processtime"] = toc - tic
1493-
return {"out": out, "metrics": metrics, "processed": {item}}
1494-
return {"out": out, "processed": {item}}
1505+
out = {"out": out, "metrics": metrics, "processed": {item}}
1506+
out = {"out": out, "processed": {item}}
1507+
1508+
if checkpointer is not None:
1509+
if not isinstance(checkpointer, CheckpointerABC):
1510+
raise TypeError(
1511+
"Expected checkpointer to derive from CheckpointerABC"
1512+
)
1513+
# save the output
1514+
checkpointer.save(out, metadata, processor_instance)
1515+
return out
14951516

14961517
def __call__(
14971518
self,
@@ -1661,6 +1682,7 @@ def run(
16611682
processor_instance="heavy",
16621683
uproot_options=uproot_options,
16631684
iteritems_options=iteritems_options,
1685+
checkpointer=self.checkpointer,
16641686
)
16651687
else:
16661688
closure = partial(
@@ -1673,6 +1695,7 @@ def run(
16731695
processor_instance=pi_to_send,
16741696
uproot_options=uproot_options,
16751697
iteritems_options=iteritems_options,
1698+
checkpointer=self.checkpointer,
16761699
)
16771700

16781701
chunks = list(chunks)

tests/test_checkpointing.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import os.path as osp
2+
import random
3+
from pathlib import Path
4+
5+
import awkward as ak
6+
import fsspec
7+
import numpy as np
8+
9+
from coffea import processor
10+
from coffea.nanoevents import schemas
11+
12+
# Seed the module-level RNG so the simulated failures below are reproducible:
# we want repeatable failures, and know that we never run indefinitely.
random.seed(1234)
14+
15+
16+
class UnstableNanoEventsProcessor(processor.ProcessorABC):
    """Processor that fails on roughly half of its chunks.

    Used to exercise checkpoint recovery: a failed chunk leaves no checkpoint
    and is retried, while successful chunks are reloaded on subsequent runs.
    """

    @property
    def accumulator(self):
        # A fresh accumulator with an empty cutflow on every access.
        return {"cutflow": {}}

    def process(self, events):
        # Simulate a flaky worker: raise on ~50% of calls (RNG seeded above).
        if random.random() < 0.5:
            raise RuntimeError("Random failure for testing checkpointing")

        dataset = events.metadata["dataset"]
        result = self.accumulator
        # Total muon count for this chunk, keyed by dataset name.
        result["cutflow"]["%s_pt" % dataset] = ak.sum(ak.num(events.Muon, axis=1))
        return result

    def postprocess(self, accumulator):
        # Nothing to finalize after accumulation.
        return accumulator
32+
33+
34+
def test_checkpointing():
    """Re-run a randomly failing processor until every chunk has a checkpoint,
    then verify that the checkpointed result is complete and correct."""
    filelist = {
        "ZJets": {
            "treename": "Events",
            "files": [osp.abspath("tests/samples/nano_dy.root")],
        },
        "Data": {
            "treename": "Events",
            "files": [osp.abspath("tests/samples/nano_dimuon.root")],
        },
    }

    executor = processor.IterativeExecutor()

    checkpoint_dir = str(Path(__file__).parent / "test_checkpointing")
    checkpointer = processor.SimpleCheckpointer(checkpoint_dir)
    run = processor.Runner(
        executor=executor,
        schema=schemas.NanoAODSchema,
        chunksize=10,
        format="root",
        checkpointer=checkpointer,
    )
    # use the chunk generator to not re-run the preprocessing step
    chunks = list(run.preprocess(filelist, "Events"))

    def chunk_gen():
        yield from chunks

    # number of WorkItems
    n_expected_checkpoints = len(chunks)
    ntries = 0
    fs, path = fsspec.url_to_fs(checkpoint_dir)

    # Start from a clean slate: a previous aborted run can leave checkpoints
    # behind (cleanup only happens on success), which would make the retry loop
    # below exit immediately and leave 'out' unbound or stale.
    if fs.exists(path):
        fs.rm(path, recursive=True)
        fs.invalidate_cache()

    def n_checkpoints():
        # count regular files below the checkpoint dir (one per WorkItem)
        return len(list(filter(fs.isfile, fs.glob(f"{path}/**"))))

    # keep trying until we have as many checkpoints as WorkItems
    while n_checkpoints() != n_expected_checkpoints:
        fs.invalidate_cache()
        ntries += 1
        try:
            out = run(chunk_gen(), UnstableNanoEventsProcessor(), "Events")
        except Exception:
            print(f"Run failed, trying again, try number {ntries}...")
            continue

    # make sure we have as many checkpoints as WorkItems
    fs.invalidate_cache()
    assert n_checkpoints() == n_expected_checkpoints

    # make sure we got the right answer
    assert out == {"cutflow": {"Data_pt": np.int64(84), "ZJets_pt": np.int64(18)}}

    # cleanup
    fs.rm(path, recursive=True)

0 commit comments

Comments
 (0)