 from absl import logging
 from etils import epath
 from kauldron.checkpoints import checkpointer
-from kauldron.checkpoints import partial_loader
 from kauldron.evals import evaluators as evaluators_lib
 from kauldron.evals import run_strategies
 from kauldron.train import train_step
@@ -70,8 +69,6 @@ def continuous_eval(
   # In eval-only mode, the model weights are restored from the init_transforms
   # and not the checkpoint, so we cannot skip it.
   state = trainer.init_state(skip_transforms=not trainer.setup.eval_only)
-  # Remove optimizer state to avoid using additional memory.
-  state = state.replace(opt_state=None)
   aux = {eval_name: train_step.Auxiliaries() for eval_name in eval_names}
 
   # If preempted, the last checkpoint might be re-computed. There could be
@@ -169,7 +166,6 @@ def _preemptable_iter_new_checkpoints(
     return
 
   trainer_ckpt = trainer.checkpointer
-  assert isinstance(trainer_ckpt, checkpointer.Checkpointer)
   eval_ckpt = _get_eval_ckpt(trainer_ckpt, eval_names)
   # If the eval checkpoint exists, there is an ongoing eval that was preempted
   # and we should resume the ongoing eval.
@@ -179,11 +175,9 @@ def _preemptable_iter_new_checkpoints(
     logging.info('Resume evaluation...')
     # Restore the state from the last eval checkpoint
     state = eval_ckpt.restore(state)
-    step = int(state.step)
     yield state
-    # state might have been donated, we should not access it after this point.
     # Eval is done, remove the duplicated checkpoint
-    eval_ckpt.delete(step)
+    eval_ckpt.delete(state.step)
 
   for step in trainer_ckpt.iter_new_checkpoints(
       min_interval_secs=10,
@@ -195,27 +189,14 @@ def _preemptable_iter_new_checkpoints(
           .exists()
       ),
   ):
-    # TODO(epot): Rather than `PartialKauldronLoader`, should instead
-    # have some `trainer_ckpt.restore(state, partial_restore=True)`
-    # Only restore the params and step from the trainer checkpoint.
-    state = partial_loader.PartialKauldronLoader(
-        workdir=trainer_ckpt.workdir,
-        # Load everything except the optimizer state.
-        new_to_old={
-            f.name: f.name
-            for f in dataclasses.fields(state)
-            if f.name != 'opt_state'
-        },
-        step=step,
-    ).transform(state)
+    state = trainer_ckpt.restore(state, step=step)
     assert int(state.step) == step
     # Temporarily copy the state to the eval checkpoint, to ensure that
     # it won't be deleted by the train job until the current eval is done.
     eval_ckpt.save(state, step=step)
     yield state
-    # state might have been donated, we should not access it after this point.
     # Eval is done, remove the duplicated checkpoint
-    eval_ckpt.delete(step)
+    eval_ckpt.delete(state.step)
 
 
 def _get_eval_ckpt(
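
For context, a rough sketch of how a caller such as `continuous_eval` might consume `_preemptable_iter_new_checkpoints` after this change. The consumer below is hypothetical and only illustrates the flow; the generator's exact signature and the per-eval call are assumptions for illustration, not the actual Kauldron API.

# Hypothetical sketch; argument order and the evaluator call are assumed.
def run_continuous_eval_sketch(trainer, eval_names):
  # Build the state skeleton once, as in the diff above.
  state = trainer.init_state(skip_transforms=not trainer.setup.eval_only)
  for state in _preemptable_iter_new_checkpoints(trainer, state, eval_names):
    # Each yielded state was restored via trainer_ckpt.restore(state, step=step)
    # and duplicated into the eval checkpoint, so the train job cannot delete
    # it while this evaluation is still running.
    for eval_name in eval_names:
      run_one_eval(eval_name, state)  # hypothetical helper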