feat: add numpyro checkpointing

davecwright3 · davecwright3 · commit cea28b2eca22 · 2025-09-05T17:19:29.000-04:00
also allows users to resume sampling
diff --git a/src/discovery/samplers/numpyro.py b/src/discovery/samplers/numpyro.py
@@ -1,12 +1,17 @@
 import inspect
+import pickle
+from pathlib import Path
 
+import jax
+import jax.numpy as jnp
 import pandas as pd
 
 import numpyro
 from numpyro import infer
 from numpyro import distributions as dist
 
 from .. import prior
+from ..pulsar import save_chain
 
 
 def makemodel_transformed(mylogl, transform=prior.makelogtransform_uniform, priordict={}):
@@ -48,3 +53,100 @@ def makesampler_nuts(numpyro_model, num_warmup=512, num_samples=1024, num_chains
     sampler.to_df = lambda: numpyro_model.to_df(sampler.get_samples())
 
     return sampler
+
+def run_nuts_with_checkpoints(
+    sampler,
+    num_samples_per_checkpoint,
+    rng_key,
+    outdir="chains",
+    resume=False,
+):
+    """Run NumPyro MCMC in chunks, saving a checkpoint after each chunk.
+
+    ``sampler.num_samples`` is taken as the total number of samples wanted. It is
+    drawn in chunks of at most ``num_samples_per_checkpoint``; after each chunk all
+    samples so far are saved to ``numpyro-samples.feather`` and the sampler's last
+    state is pickled to ``numpyro-checkpoint.pickle``, both inside ``outdir``.
+
+    Parameters
+    ----------
+    sampler : numpyro.infer.MCMC
+        A NumPyro MCMC sampler object that also provides ``to_df()``, e.g. one
+        built by ``makesampler_nuts``.
+    num_samples_per_checkpoint : int
+        The maximum number of samples to draw between checkpoints (at least 1).
+    rng_key : jax.random.PRNGKey
+        The random number generator key for JAX. It is split after every chunk.
+    outdir : str | Path
+        The directory for output files. It is created if it doesn't exist.
+    resume : bool
+        Whether to look for a state to resume from. If both output files exist,
+        only the missing samples are drawn; otherwise sampling starts afresh.
+
+    Returns
+    -------
+    None
+        This function doesn't return any value but saves the results to disk.
+
+    Raises
+    ------
+    ValueError
+        If ``num_samples_per_checkpoint`` is smaller than 1.
+
+    Side Effects
+    ------------
+    - Rewrites the feather file with all samples and the pickle file with the
+      latest sampler state after each chunk.
+    - Sets ``sampler.post_warmup_state``; ``sampler.num_samples`` is changed while
+      sampling and restored to the requested total before returning.
+
+    Example
+    -------
+    >>> import discovery.samplers.numpyro as ds_numpyro
+    >>> # Assume `model` is configured
+    >>> npsampler = ds_numpyro.makesampler_nuts(model, num_samples=100, num_warmup=50)
+    >>> ds_numpyro.run_nuts_with_checkpoints(npsampler, 10, jax.random.key(42))
+
+    """
+    if num_samples_per_checkpoint < 1:
+        raise ValueError("num_samples_per_checkpoint must be a positive integer")
+
+    # Always create the directory, also when `outdir` is already a Path.
+    outdir = Path(outdir)
+    outdir.mkdir(exist_ok=True, parents=True)
+    samples_file = outdir / "numpyro-samples.feather"
+    checkpoint_file = outdir / "numpyro-checkpoint.pickle"
+
+    total_samples = sampler.num_samples
+    df = None
+    if resume and checkpoint_file.is_file() and samples_file.is_file():
+        df = pd.read_feather(samples_file)
+        # NOTE: unpickling runs arbitrary code; only resume from trusted directories.
+        with checkpoint_file.open("rb") as f:
+            sampler.post_warmup_state = pickle.load(f)
+
+    remaining = total_samples - (0 if df is None else df.shape[0])
+    first_chunk = True
+    try:
+        while remaining > 0:
+            # The last chunk is shortened so the total is never exceeded.
+            chunk = min(num_samples_per_checkpoint, remaining)
+            sampler.num_samples = chunk
+            if first_chunk:
+                # Needed before the first run only; later the post-warmup state does it.
+                sampler._set_collection_params()
+                first_chunk = False
+
+            sampler.run(rng_key)
+            df_new = sampler.to_df()
+            df = df_new if df is None else pd.concat([df, df_new], ignore_index=True)
+            save_chain(df, samples_file)
+            with checkpoint_file.open("wb") as f:
+                pickle.dump(sampler.last_state, f)
+            sampler.post_warmup_state = sampler.last_state
+            remaining -= chunk
+            rng_key, _ = jax.random.split(rng_key)
+    finally:
+        # Leave the sampler configured for the requested total, even after an error.
+        sampler.num_samples = total_samples
+        sampler._set_collection_params()