Merged
Changes from 64 commits
67 commits
ccfa898
hotfix: Fixed typo in core module
juanwulu Nov 19, 2025
0810627
feat: Updated implementation of RefineNet
juanwulu Nov 19, 2025
b60e746
hotfix: Fixed typo in core module
juanwulu Nov 19, 2025
95be2cb
feat: Updated implementation of HuggingFace datamodule
juanwulu Nov 19, 2025
2e923ce
hotfix: Fixed build target for utility module
juanwulu Nov 19, 2025
7ca6e1e
hotfix: Fixed typo in build targets
juanwulu Nov 19, 2025
35ad184
feat: Updated implementation of MeanFlow
juanwulu Nov 19, 2025
8ca3ed3
feat: Added main entrypoint for training and evaluation of generative…
juanwulu Nov 19, 2025
0ab6cfc
feat: Updated configuration for training U-Net meanflow on CIFAR-10
juanwulu Nov 19, 2025
f36fbeb
hotfix: Fixed issue with version of `chex`
juanwulu Nov 19, 2025
e9816c8
hotfix: Updated main train logic
juanwulu Nov 19, 2025
c2fb1ba
hotfix: Improve the log frequency for meanflow on CIFAR-10
juanwulu Nov 19, 2025
a2f8fbe
feat: Updated the main logic for training step in MeanFlow
juanwulu Nov 19, 2025
91e96b3
feat: Updated implementation of MeanFlow
juanwulu Nov 19, 2025
dd4df9b
feat: Updated implementation of MeanFlow
juanwulu Nov 19, 2025
d585b66
feat: Added checkpoint frequency attribute to trainer config
juanwulu Nov 19, 2025
41223e0
feat: Updated implementation of train logic
juanwulu Nov 19, 2025
4357a02
feat: Updated the main logic for training step in MeanFlow
juanwulu Nov 19, 2025
b7d1967
feat: Updated the model protocol
juanwulu Nov 19, 2025
60db887
feat: Updated the training logic
juanwulu Nov 19, 2025
c435f02
feat: Implemented the new model protocol for MeanFlow
juanwulu Nov 19, 2025
3b12acb
feat: Implemented the new main entrypoint with train logic
juanwulu Nov 19, 2025
cbfaa3c
feat: Updated implementation for huggingface dataset
juanwulu Nov 19, 2025
d368e52
hotfix: Fixed error in huggingface datamodule
juanwulu Nov 19, 2025
d91518f
hotfix: Updated checkpoint frequency
juanwulu Nov 20, 2025
07ab0aa
feat: Added visualization utility to create a grid of images
juanwulu Nov 21, 2025
f462f55
hotfix: Fixed wrong implementation of t\neq{r} in meanflow
juanwulu Nov 21, 2025
3649da1
feat: symmetric mean flow
lan-qing Nov 21, 2025
4bad082
hotfix: Switch back to original meanflow loss
juanwulu Nov 21, 2025
63b4cdf
feat: Added dependencies for running on MPS framework
juanwulu Nov 28, 2025
222a884
feat: Added MPS dependencies to PIP hubs
juanwulu Nov 28, 2025
df11ccb
feat: Added implementation for downsampling residual block in U-Net
juanwulu Nov 28, 2025
e75bc79
hotfix: Rename `DownResNetBlock` to `ResNetBlock`
juanwulu Nov 28, 2025
9fec7d4
feat: Added implementation for downsampling block in U-Net
juanwulu Nov 28, 2025
147427d
feat: Added implementation for upsampling block in U-Net
juanwulu Nov 28, 2025
6ce9152
feat: Added full implementation of U-Net for score-based generative m…
juanwulu Nov 28, 2025
e3e0c4f
feat: Added implementation for scaled dot-product attention block
juanwulu Nov 28, 2025
19964ba
feat: Integrate attention block to score U-Net architecture
juanwulu Nov 28, 2025
76781b3
hotfix: Adds missing attention block in upsampling path of U-Net
juanwulu Nov 28, 2025
6716140
feat: Integrates score-based U-Net for meanflow experiment on CIFAR-10
juanwulu Nov 28, 2025
70bb67c
hotfix: Fixes issue of `dropout_rate` in U-Net for meanflow
juanwulu Nov 28, 2025
27bc1ec
hotfix: Fixes issue of missing dropout rng in U-Net for meanflow
juanwulu Nov 28, 2025
d0e352a
hotfix: Fixes issue of missing dropout rng in U-Net for meanflow
juanwulu Nov 28, 2025
9d45b9c
hotfix: Updated configurations for training U-Net on CIFAR-10
juanwulu Nov 29, 2025
03655ed
hotfix: Fixed implementation of logit-normal timestamp sampler
juanwulu Nov 29, 2025
f307e8c
hotfix: Fixed implementation of logit-normal timestamp sampler
juanwulu Nov 29, 2025
90b5d4e
hotfix: Fixed implementation for JAX in MacOS
juanwulu Nov 29, 2025
12ee102
feat: Implements sinusoidal positional encoding for U-Net
juanwulu Nov 29, 2025
6f80f07
hotfix: Updated step output to contain model output array
juanwulu Nov 30, 2025
a42758a
feat: Updated grid visualization function to use jax array
juanwulu Nov 30, 2025
cef9f5a
feat: Implements the evaluation step for meanflow with visualization
juanwulu Nov 30, 2025
de1f95c
feat: Moved evaluation to before the training inner loop
juanwulu Nov 30, 2025
74e69ce
feat: Added random left-right flip in training loop
juanwulu Nov 30, 2025
000f1a6
feat: Updated implementation for MeanFlow network and remove label co…
juanwulu Nov 30, 2025
d8e4b76
hotfix: Fixed error raised by wrong shape checking
juanwulu Nov 30, 2025
a50c12e
feat: Fixed training collapse by adding fc layers for timestamp condi…
juanwulu Dec 1, 2025
1c4f404
hotfix: Fixed wrong implementation of timestamp conditioning in forwa…
juanwulu Dec 1, 2025
551c366
feat: Added histogram attribute to the model step output
juanwulu Dec 1, 2025
a2060a7
feat: Added histogram logging for training and evaluation
juanwulu Dec 1, 2025
06ee4c5
feat: Updated implementation for meanflow model to take arbitrary tup…
juanwulu Dec 1, 2025
50df9aa
hotfix: Fixed error in logging histograms
juanwulu Dec 1, 2025
ef2bcac
feat: Updated implementation for U-Net model in MeanFlow
juanwulu Dec 1, 2025
2b97614
hotfix: Increased data loading batch size to 1024 for CIFAR-10
juanwulu Dec 1, 2025
0635517
feat: Updated implementation for evaluation step
juanwulu Dec 1, 2025
c4f14b9
hotfix: Fixed typo
juanwulu Dec 2, 2025
abcf902
hotfix: Fixed infinite outer loop in training
juanwulu Dec 2, 2025
29296d1
hotfix: Fixed conflict in naming of `batch`
juanwulu Dec 2, 2025
2 changes: 2 additions & 0 deletions .gitignore
@@ -212,3 +212,5 @@ cython_debug/
/data/
/logs/
requirements_*.txt

.specstory/
6 changes: 6 additions & 0 deletions MODULE.bazel
@@ -61,9 +61,15 @@ pip.parse(
python_version = "3.10",
requirements_lock = "//third_party:requirements_3_10_tpu_lock.txt",
)
pip.parse(
hub_name = "ml_infra_mps_3_10",
python_version = "3.10",
requirements_lock = "//third_party:requirements_3_10_mps_lock.txt",
)
use_repo(
pip,
ml_infra_cpu_3_10 = "ml_infra_cpu_3_10",
ml_infra_cuda_3_10 = "ml_infra_cuda_3_10",
ml_infra_mps_3_10 = "ml_infra_mps_3_10",
ml_infra_tpu_3_10 = "ml_infra_tpu_3_10",
)
8 changes: 4 additions & 4 deletions src/core/BUILD
@@ -8,7 +8,7 @@ ml_py_library(
deps = [
"fiddle",
"optax",
":data",
":datamodule",
":model",
],
)
@@ -24,7 +24,7 @@ ml_py_library(
deps = [
"clu",
"jax",
":data",
":datamodule",
":model",
"//src/utilities:logging",
],
@@ -36,8 +36,8 @@ ml_py_library(
deps = [
"chex",
"flax",
"jax",
"jaxtyping",
":train_state",
],
)

@@ -60,7 +60,7 @@ ml_py_library(
"flax",
"jax",
"jaxtyping",
":data",
":datamodule",
":model",
":train_state",
"//src/utilities:logging",
7 changes: 5 additions & 2 deletions src/core/config.py
@@ -4,7 +4,7 @@
import fiddle as fdl
import optax

from src.core import data as _data
from src.core import datamodule as _datamodule
from src.core import model as _model


@@ -20,7 +20,7 @@ class DataConfig:
drop_remainder (bool): Whether to drop the last incomplete batch.
"""

module: fdl.Partial[_data.DataModule]
module: fdl.Partial[_datamodule.DataModule]
batch_size: int = 32
num_workers: int = 4
deterministic: bool = True
@@ -45,6 +45,8 @@ class TrainerConfig:

Attributes:
num_train_steps (int): Total number of training steps.
checkpoint_every_n_steps (Optional[int]): Frequency of checkpointing.
If `None`, defaults to `eval_every_n_steps`.
log_every_n_steps (int): Frequency of logging training metrics.
eval_every_n_steps (int): Frequency of evaluation during training.
checkpoint_dir (Optional[str]): Directory of checkpoint to resume from.
@@ -53,6 +55,7 @@
"""

num_train_steps: int = 10_000
checkpoint_every_n_steps: typing.Optional[int] = None
log_every_n_steps: int = 50
eval_every_n_steps: int = 1_000
checkpoint_dir: typing.Optional[str] = None
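The new `checkpoint_every_n_steps` attribute is documented to fall back to `eval_every_n_steps` when left as `None`. A minimal pure-Python sketch of that fallback; the `checkpoint_interval` helper is hypothetical and only illustrates the documented default, it is not part of this PR:

```python
import dataclasses
import typing


@dataclasses.dataclass
class TrainerConfig:
    """Trimmed-down stand-in for the PR's TrainerConfig."""

    num_train_steps: int = 10_000
    checkpoint_every_n_steps: typing.Optional[int] = None
    log_every_n_steps: int = 50
    eval_every_n_steps: int = 1_000


def checkpoint_interval(cfg: TrainerConfig) -> int:
    # Fall back to the evaluation frequency when no explicit
    # checkpoint frequency is configured, as the docstring describes.
    if cfg.checkpoint_every_n_steps is None:
        return cfg.eval_every_n_steps
    return cfg.checkpoint_every_n_steps


print(checkpoint_interval(TrainerConfig()))  # → 1000 (falls back to eval)
print(checkpoint_interval(TrainerConfig(checkpoint_every_n_steps=500)))  # → 500
```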
54 changes: 35 additions & 19 deletions src/core/evaluate.py
@@ -1,20 +1,22 @@
import collections
import functools
import traceback
import typing

from clu import metric_writers
from clu import periodic_actions
import jax
from jax import numpy as jnp
import jaxtyping

from src.core import data as _data
from src.core import datamodule as _datamodule
from src.core import model as _model
from src.utilities import logging


def run(
model: _model.Model,
datamodule: _data.DataModule,
datamodule: _datamodule.DataModule,
evaluation_step: typing.Callable[..., _model.StepOutputs],
params: jaxtyping.PyTree,
writer: metric_writers.MetricWriter,
work_dir: str,
@@ -24,8 +26,8 @@ def run(
"""Runs evaluation loop with the given model and datamodule.

Args:
model (Model): The model to evaluate.
datamodule (DataModule): The datamodule providing the evaluation data.
evaluation_step (Callable): The pmapped evaluation step function.
params (PyTree): The model parameters to use for evaluation.
writer (MetricWriter): The metric writer for logging evaluation metrics.
work_dir (str): The working directory for saving outputs.
@@ -36,11 +38,11 @@ def run(
Integer status code (0 for success).
"""
_status = 0
logging.rank_zero_debug(f"running {model.__class__.__name__} eval...")

eval_rng = jax.random.fold_in(rng, jax.process_index())
p_evaluation_step = functools.partial(model.evaluation_step, rng=eval_rng)
logging.rank_zero_info("Compiling evaluation step...")
p_evaluation_step = functools.partial(evaluation_step, rng=rng)
p_evaluation_step = jax.pmap(p_evaluation_step, axis_name="batch")
logging.rank_zero_info("Compiling evaluation step...DONE!")

hooks = []
if jax.process_index() == 0:
@@ -69,7 +71,7 @@ def run(
batch,
)
with jax.profiler.StepTraceAnnotation(
name="train",
name="evaluation",
step_num=step,
):
outputs = p_evaluation_step(
@@ -85,38 +87,52 @@

# logging at the end of batch
if outputs.scalars is not None:
_scalars = {}
for k, v in outputs.scalars.items():
eval_metrics[k].append(jax.device_get(v).mean())
_scalars[
f"eval/{k.replace('_', ' ')}"
] = jax.device_get(v).mean()
writer.write_scalars(
step=step + 1,
scalars=_scalars,
step=step,
scalars={
f"eval/{k}_step": sum(v) / len(v)
for k, v in outputs.scalars.items()
},
)
if outputs.images is not None:
writer.write_images(
step=step + 1,
images=outputs.images,
step=step,
images={
f"eval/{k}_step": v
for k, v in outputs.images.items()
},
)
if outputs.histograms is not None:
writer.write_histograms(
step=step,
arrays={
f"eval/{k}_step": v
for k, v in outputs.histograms.items()
},
)
writer.flush()

# logging at the end of evaluation
logging.rank_zero_info("Evaluation done.")
scalar_output = {
f"eval/{k.replace('_', ' ')}": sum(v) / len(v)
f"eval/{k.replace('_', ' ')}_epoch": sum(v) / len(v)
for k, v in eval_metrics.items()
}
writer.write_scalars(
step=step,
scalars=scalar_output,
)
writer.flush()

except Exception as e:
logging.rank_zero_error(
"Exception occurred during evaluation: %s", e
)
error_trace = traceback.format_exc()
logging.rank_zero_error("Stack trace:\n%s", error_trace)
_status = 1
finally:
writer.close()
logging.rank_zero_info(
"Evaluation done. Exit with code %d.",
_status,
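The rewritten loop logs per-step scalars under `eval/<name>_step` keys while accumulating each metric, then averages the accumulated values under `eval/<name>_epoch` keys at the end of evaluation. A minimal sketch of that accumulate-then-average pattern, with hypothetical metric values standing in for the device arrays returned by the pmapped evaluation step:

```python
import collections

# Hypothetical per-step scalar outputs, standing in for the arrays
# returned by the pmapped evaluation step in the PR.
step_outputs = [
    {"loss": 0.9, "mse": 0.5},
    {"loss": 0.7, "mse": 0.3},
]

eval_metrics = collections.defaultdict(list)
for outputs in step_outputs:
    # During the loop, each step's scalars are both logged (as
    # "eval/<name>_step") and appended for the end-of-run summary.
    for k, v in outputs.items():
        eval_metrics[k].append(v)

# After the loop, the summary averages the accumulated values and
# switches to an "eval/<name>_epoch" key, as in the PR's diff.
scalar_output = {
    f"eval/{k.replace('_', ' ')}_epoch": sum(v) / len(v)
    for k, v in eval_metrics.items()
}
print(scalar_output)
```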
78 changes: 30 additions & 48 deletions src/core/model.py
@@ -2,23 +2,27 @@
import typing

import chex
from flax import struct
from flax.core import frozen_dict
import jax
import jaxtyping

from src.core import train_state as _train_state


@chex.dataclass
class StepOutputs:
"""A base container for outputs from a single step.

Attributes:
output (Optional[jax.Array]): The main output of the model.
scalars (Optional[Dict[str, Any]]): A dictionary of scalar metrics.
images (Optional[Dict[str, Any]]): A dictionary of image outputs.
histograms (Optional[Dict[str, Array]]): A dictionary of array to
plot as histograms.
"""

output: typing.Optional[jax.Array] = None
scalars: typing.Optional[typing.Dict[str, typing.Any]] = None
images: typing.Optional[typing.Dict[str, typing.Any]] = None
histograms: typing.Optional[typing.Dict[str, jax.Array]] = None


class Model(abc.ABC):
@@ -51,67 +55,45 @@ def init(
pass

@abc.abstractmethod
def training_step(
def compute_loss(
self,
*,
state: _train_state.TrainState,
batch: typing.Any,
rngs: typing.Union[typing.Any, typing.Dict[str, typing.Any]],
rngs: typing.Any,
deterministic: bool = False,
params: frozen_dict.FrozenDict,
**kwargs,
) -> typing.Tuple[struct.PyTreeNode, StepOutputs]:
r"""Performs a single training step.
) -> typing.Tuple[jax.Array, StepOutputs]:
"""Computes the loss given parameters and model inputs.

Args:
state (TrainState): The current training state.
batch (Any): A batch of data.
rngs (Union[Any, Dict[str, Any]]): Random generators.
**kwargs: Additional keyword arguments.
deterministic (bool): Whether to run the model in deterministic
mode (e.g., disable dropout). Default is `False`.
params (FrozenDict): The model parameters.
**kwargs: Keyword arguments consumed by the model.

Returns:
A tuple containing the updated state and step outputs.
A tuple containing the loss and the step outputs.
"""
pass
raise NotImplementedError

@abc.abstractmethod
def evaluation_step(
def forward(
self,
*,
params: jaxtyping.PyTree,
batch: typing.Any,
rngs: typing.Union[typing.Any, typing.Dict[str, typing.Any]],
rngs: typing.Any,
deterministic: bool = True,
params: frozen_dict.FrozenDict,
**kwargs,
) -> StepOutputs:
r"""Performs a single evaluation step.

Args:
params (PyTree): The model parameters.
batch (Any): A batch of data.
rngs (Union[Any, Dict[str, Any]]): Random generators.
**kwargs: Additional keyword arguments.

Returns:
The step outputs containing evaluation metrics.
"""
pass

@abc.abstractmethod
def predict_step(
self,
*,
params: jaxtyping.PyTree,
batch: typing.Any,
rngs: typing.Union[typing.Any, typing.Dict[str, typing.Any]],
**kwargs,
) -> typing.Any:
r"""Performs a single prediction step during inference.
"""Forward pass the model and returns the output tree structure.

Args:
params (PyTree): The model parameters.
batch (Any): A batch of data.
rngs (Union[Any, Dict[str, Any]]): Random generators.
**kwargs: Additional keyword arguments.
deterministic (bool): Whether to run the model in deterministic
mode (e.g., disable dropout). Default is `True`.
params (FrozenDict): The model parameters.
**kwargs: Keyword arguments consumed by the model.

Returns:
The model's predictions.
The model outputs.
"""
pass
raise NotImplementedError
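The protocol change replaces `training_step`/`evaluation_step`/`predict_step` with `compute_loss` and `forward`, so the training loop, rather than the model, owns the optimization step. A pure-Python sketch of a model implementing that shape; the `ToyModel` and its MSE loss are illustrative only, and the real protocol uses Flax/JAX types (`FrozenDict` params, `jax.Array` outputs):

```python
import dataclasses
import typing


@dataclasses.dataclass
class StepOutputs:
    """Mirror of the PR's StepOutputs container (a chex.dataclass upstream)."""

    output: typing.Optional[typing.Any] = None
    scalars: typing.Optional[typing.Dict[str, typing.Any]] = None
    images: typing.Optional[typing.Dict[str, typing.Any]] = None
    histograms: typing.Optional[typing.Dict[str, typing.Any]] = None


class ToyModel:
    """Hypothetical model following the new compute_loss/forward protocol."""

    def forward(self, *, batch, rngs=None, deterministic=True, params=None):
        # A trivial "prediction": scale each input by a learned weight.
        return [params["w"] * x for x in batch["inputs"]]

    def compute_loss(self, *, batch, rngs=None, deterministic=False, params=None):
        preds = self.forward(
            batch=batch, rngs=rngs, deterministic=deterministic, params=params
        )
        # Mean squared error against the targets; compute_loss returns the
        # (loss, StepOutputs) pair the new protocol specifies, so a training
        # loop can differentiate it and apply optimizer updates itself.
        loss = sum((p - t) ** 2 for p, t in zip(preds, batch["targets"]))
        loss /= len(preds)
        return loss, StepOutputs(output=preds, scalars={"loss": loss})


model = ToyModel()
batch = {"inputs": [1.0, 2.0], "targets": [2.0, 4.0]}
loss, outputs = model.compute_loss(batch=batch, params={"w": 2.0})
print(loss)  # → 0.0, since w=2.0 maps the inputs exactly onto the targets
```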