Commit 2c4117c

feat (experimental): log step-wise metrics
1 parent b560cc8 commit 2c4117c

6 files changed: +117 -108 lines changed


dmlcloud/core/callbacks.py

Lines changed: 80 additions & 48 deletions

@@ -51,8 +51,8 @@ class TimedeltaFormatter:
     def __init__(self, microseconds=False):
         self.microseconds = microseconds
 
-    def __call__(self, value: torch.Tensor) -> str:
-        delta = timedelta(seconds=value.item())
+    def __call__(self, seconds: float) -> str:
+        delta = timedelta(seconds=seconds)
         if not self.microseconds:
             delta -= timedelta(microseconds=delta.microseconds)
         return str(delta)

@@ -240,14 +240,15 @@ def pre_epoch(self, stage: 'Stage'):
     def post_epoch(self, stage: 'Stage'):
         self.epoch_end_time = datetime.now()
 
-        stage.log('misc/epoch', stage.current_epoch, prefixed=False)
-        stage.log('misc/epoch_time', (stage.epoch_end_time - self.epoch_start_time).total_seconds(), prefixed=False)
-        stage.log('misc/total_time', (stage.epoch_end_time - self.start_time).total_seconds(), prefixed=False)
+        epoch_time = (stage.epoch_end_time - self.epoch_start_time).total_seconds()
+        total_time = (stage.epoch_end_time - self.start_time).total_seconds()
+        stage.log('misc/epoch_time', epoch_time, prefixed=False, log_step=False)
+        stage.log('misc/total_time', total_time, prefixed=False, log_step=False)
 
         if stage._run_epoch_overridden:
             average_epoch_time = (stage.epoch_end_time - self.start_time) / (stage.current_epoch + 1)
             eta = average_epoch_time * (stage.max_epochs - stage.current_epoch - 1)
-            stage.log('misc/eta', eta.total_seconds(), prefixed=False)
+            stage.log('misc/eta', eta.total_seconds(), prefixed=False, log_step=False)
 
 
 class TableCallback(Callback):

@@ -345,12 +346,42 @@ class ReduceMetricsCallback(Callback):
     A callback that reduces the metrics at the end of each epoch and appends them to the history.
     """
 
-    def post_epoch(self, stage: 'Stage'):
+    def __init__(self, log_every_n_steps=50):
+        self.log_every_n_steps = log_every_n_steps
+
+    def _reduce_epoch_metrics(self, stage):
         metrics = stage.metrics.reduce()
         stage.history.append_metrics(**metrics)
-        stage.history.next_step()
+
+    def _reduce_step_metrics(self, stage):
+        metrics = stage.step_metrics.reduce()
+        stage.step_history.append_metrics(**metrics)
+
+    def post_epoch(self, stage: 'Stage'):
+        stage.log('misc/epoch', stage.current_epoch, prefixed=False, reduction='max')
+        self._reduce_epoch_metrics(stage)
         stage.step = 0 # Reset the step counter
 
+    def post_step(self, stage: 'Stage'):
+        stage.log('misc/step', stage.global_step, prefixed=False, reduction='max')
+
+        if stage.global_step % self.log_every_n_steps == 0:
+            self._reduce_step_metrics(stage)
+
+        stage.step += 1
+        stage.global_step += 1
+
+    def post_stage(self, stage):
+        has_unreduced_metrics = False
+        for metric in stage.step_metrics.metrics.values():
+            if metric.update_called:
+                has_unreduced_metrics = True
+                break
+
+        # need to check global_step > 0 to avoid reducing when finish_step() was never called once
+        if has_unreduced_metrics and stage.global_step > 0:
+            self._reduce_step_metrics(stage)
+
 
 class CheckpointCallback(Callback):
     """

@@ -391,60 +422,61 @@ class CsvCallback(Callback):
     Saves metrics to a CSV file at the end of each epoch.
     """
 
-    def __init__(self, path: Union[str, Path], append_stage_name: bool = False):
+    def __init__(self, directory: Union[str, Path]):
         """
         Initialize the callback with the given path.
 
         Args:
-            path (Union[str, Path]): The file path where the callback will operate.
-            append_stage_name (bool, optional): Whether to append the stage name to the path. Defaults to False.
-        """
-        self.path = Path(path)
-        self.append_stage_name = append_stage_name
-
-    def csv_path(self, stage: 'Stage'):
+            directory (Union[str, Path]): The path to the directory where the CSV files will be saved.
         """
-        Generate the CSV file path for the given stage.
-
-        If `append_stage_name` is True, the method appends the stage name to the file name.
-        Otherwise, it returns the base path.
+        self.directory = Path(directory)
+        self.last_steps = {}
+
+    def _build_name(self, stage: 'Stage', prefix: str):
+        duplicate_stages = [s for s in stage.pipe.stages if s.name == stage.name]
+        idx = duplicate_stages.index(stage)
+        if len(duplicate_stages) > 1:
+            return self.directory / f'{prefix}_{stage.name}_{idx + 1}.csv'
+        else:
+            return self.directory / f'{prefix}_{stage.name}.csv'
 
-        Args:
-            stage (Stage): The stage object containing the name to be appended.
+    def epoch_path(self, stage: 'Stage'):
+        return self._build_name(stage, 'epoch_metrics')
 
-        Returns:
-            Path: The complete path to the CSV file.
-        """
-
-        if self.append_stage_name:
-            duplicate_stages = [s for s in stage.pipe.stages if s.name == stage.name]
-            idx = duplicate_stages.index(stage)
-            if len(duplicate_stages) > 1:
-                return self.path / f'metrics_{stage.name}_{idx + 1}.csv'
-            else:
-                return self.path / f'metrics_{stage.name}.csv'
-        else:
-            return self.path
+    def step_path(self, stage: 'Stage'):
+        return self._build_name(stage, 'step_metrics')
 
     def pre_stage(self, stage: 'Stage'):
         # If for some reason we can't write to the file or it exists already, its better to fail early
-        with open(self.csv_path(stage), 'x'):
+        with open(self.epoch_path(stage), 'x'):
             pass
 
-    def post_epoch(self, stage: 'Stage'):
-        with open(self.csv_path(stage), 'a') as f:
-            writer = csv.writer(f)
+    def _write_history(self, file, history, step_metric, step_name):
+        writer = csv.writer(file)
+
+        metric_names = list(history.keys())
+        metric_names.remove(step_metric)
 
-            metrics = stage.history.last()
+        writer.writerow([step_name] + metric_names) # Header
+        for row in history.rows():
+            csv_row = [row[step_metric]] + [row[name] for name in metric_names]
+            writer.writerow(csv_row)
 
-            # Write the header if the file is empty
-            if f.tell() == 0:
-                writer.writerow(['epoch'] + list(metrics))
+    def _maybe_write_step_metrics(self, stage: 'Stage'):
+        if stage.step_history.num_steps > self.last_steps.get(stage, 0):
+            self.last_steps[stage] = stage.step_history.num_steps
+            with open(self.step_path(stage), 'w') as f:
+                self._write_history(f, stage.step_history, 'misc/step', 'step')
+
+    def post_epoch(self, stage: 'Stage'):
+        with open(self.epoch_path(stage), 'w') as f:
+            self._write_history(f, stage.history, 'misc/epoch', 'epoch')
 
-            row = [stage.current_epoch - 1] # epoch is already incremented
-            for value in metrics.values():
-                row.append(value.item())
-            writer.writerow(row)
+    def post_step(self, stage: 'Stage'):
+        self._maybe_write_step_metrics(stage)
+
+    def post_stage(self, stage):
+        self._maybe_write_step_metrics(stage) # edge case: last steps of training
 
 
 class WandbInitCallback(Callback):

@@ -523,7 +555,7 @@ def pre_run(self, pipe):
     def post_epoch(self, stage: 'Stage'):
         metrics = stage.history.last()
         for key, value in metrics.items():
-            self.writer.add_scalar(key, value.item(), stage.current_epoch)
+            self.writer.add_scalar(key, value, stage.current_epoch)
 
     def cleanup(self, pipe, exc_type, exc_value, traceback):
         if self.writer is not None:

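For reference, the reworked CsvCallback rewrites the whole history on every write, with the step column first and the remaining metric names as the header. Below is a self-contained sketch of the resulting file layout using only the csv module; the metric names and values are made up for illustration and are not part of this commit.

import csv
import io

# Hypothetical reduced step metrics, shaped like the rows yielded by TrainingHistory.rows()
history_rows = [
    {'misc/step': 0, 'train/loss': 2.31, 'train/accuracy': 0.11},
    {'misc/step': 50, 'train/loss': 1.07, 'train/accuracy': 0.64},
]

buf = io.StringIO()
writer = csv.writer(buf, lineterminator='\n')
metric_names = [name for name in history_rows[0] if name != 'misc/step']
writer.writerow(['step'] + metric_names)  # header row, mirroring _write_history
for row in history_rows:
    writer.writerow([row['misc/step']] + [row[name] for name in metric_names])

print(buf.getvalue())
# step,train/loss,train/accuracy
# 0,2.31,0.11
# 50,1.07,0.64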
dmlcloud/core/metrics.py

Lines changed: 20 additions & 40 deletions

@@ -34,7 +34,6 @@ class TrainingHistory:
 
     def __init__(self):
         self.num_steps = 0
-        self._current_values = {}
         self._metrics = {}
         self._dtypes = {}
 

@@ -65,6 +64,10 @@ def values(self):
     def items(self):
         return [(name, self[name]) for name in self._metrics]
 
+    def rows(self):
+        for i in range(self.num_steps):
+            yield {name: self._metrics[name][i] for name in self._metrics}
+
     def append_metric(self, name: str, value: Union[ArrayLike, Any]):
         """
         Adds a value for a metric at the current step.

@@ -76,14 +79,6 @@ def append_metric(self, name: str, value: Union[ArrayLike, Any]):
         if name in self._current_values:
             raise ValueError(f'Metric {name} already has a value for step {self.num_steps}')
 
-        if name not in self._metrics and self.num_steps > 0:
-            raise ValueError(f'Cannot add metric {name} after the first step')
-
-        if isinstance(value, torch.Tensor):
-            value = value.detach().to('cpu', non_blocking=True)
-
-        self._current_values[name] = value
-
     def append_metrics(self, **metrics):
         """
         Adds multiple metrics at the current step.

@@ -92,28 +87,16 @@ def append_metrics(self, **metrics):
             **metrics: The metrics to add.
         """
         for name, value in metrics.items():
-            self.append_metric(name, value)
-
-    def next_step(self):
-        """
-        Advances the step counter.
-        """
-
-        for name in self._metrics:
-            if name not in self._current_values:
-                raise ValueError(f'Metric {name} does not have a value for step {self.num_steps}')
-
-        for name, value in self._current_values.items():
-            if type(value) == ArrayLike: # noqa
-                value = np.as_array(value)
+            dtype = value.dtype if type(value) == ArrayLike else object # noqa
+            if isinstance(value, torch.Tensor) or isinstance(value, np.ndarray):
+                value = value.item()
 
             if name not in self._metrics:
-                self._metrics[name] = [value]
-                self._dtypes[name] = value.dtype if type(value) == ArrayLike else object # noqa
+                self._metrics[name] = ([None] * self.num_steps) + [value]
+                self._dtypes[name] = dtype
             else:
                 self._metrics[name].append(value)
 
-        self._current_values = {}
         self.num_steps += 1
 
     def last(self) -> dict[str, Any]:

@@ -126,16 +109,6 @@ def last(self) -> dict[str, Any]:
 
         return {name: values[-1] for name, values in self._metrics.items()}
 
-    def current(self) -> dict[str, Any]:
-        """
-        Returns the current, but not yet saved, value for each metric.
-
-        Returns:
-            dict[str, Any]: The current value for each metric.
-        """
-
-        return {name: self._current_values[name] for name in self._current_values}
-
     def min(self) -> dict[str, min_return_type]:
         """
         Returns a namedtuple (value, step) containing the minimum value and the corresponding step for each metric across all steps.

@@ -180,10 +153,12 @@ def log(self, name: str, value: Any, reduction: str = 'mean', **kwargs):
         if not torch.is_tensor(value):
             value = torch.tensor(value)
         value = value.cpu()
+        dtype = value.dtype
 
         if name not in self.metrics:
             if reduction == 'mean':
                 metric = torchmetrics.MeanMetric(**kwargs)
+                dtype = torch.float32
             elif reduction == 'sum':
                 metric = torchmetrics.SumMetric(**kwargs)
             elif reduction == 'min':

@@ -192,15 +167,20 @@ def log(self, name: str, value: Any, reduction: str = 'mean', **kwargs):
                 metric = torchmetrics.MaxMetric(**kwargs)
             elif reduction == 'cat':
                 metric = torchmetrics.CatMetric(**kwargs)
-            self.add_metric(name, metric.cpu())
+            metric = metric.cpu().set_dtype(dtype)
+            self.add_metric(name, metric)
 
         self.metrics[name].update(value)
 
-    def reduce(self):
+    def reduce(self, reset: bool = True):
         values = {}
         for name, metric in self.metrics.items():
-            values[name] = metric.compute()
-            metric.reset()
+            if metric.update_called:
+                values[name] = metric.compute()
+                if reset:
+                    metric.reset()
+            else:
+                values[name] = None
         return values
 
     def clear(self):

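A short sketch of how the reworked TrainingHistory behaves after this change: append_metrics() now records values and advances the step in one call (next_step() is gone), a metric that first appears at a later step is back-filled with None, and rows() yields one dict per step. The import path is taken from this commit; the metric names and values below are purely illustrative.

from dmlcloud.core.metrics import TrainingHistory

history = TrainingHistory()
history.append_metrics(**{'misc/step': 0, 'train/loss': 2.31})
history.append_metrics(**{'misc/step': 50, 'train/loss': 1.07, 'train/accuracy': 0.64})

for row in history.rows():
    print(row)
# {'misc/step': 0, 'train/loss': 2.31, 'train/accuracy': None}
# {'misc/step': 50, 'train/loss': 1.07, 'train/accuracy': 0.64}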
dmlcloud/core/pipeline.py

Lines changed: 1 addition & 1 deletion

@@ -229,7 +229,7 @@ def enable_checkpointing(
 
         if is_root():
            self.add_callback(CheckpointCallback(self.run_dir), CbPriority.CHECKPOINT)
-            self.add_callback(CsvCallback(self.run_dir, append_stage_name=True), CbPriority.CSV)
+            self.add_callback(CsvCallback(self.run_dir), CbPriority.CSV)
            self.add_callback(TensorboardCallback(self.run_dir), CbPriority.TENSORBOARD)
 
     def enable_wandb(

dmlcloud/core/stage.py

Lines changed: 5 additions & 3 deletions

@@ -60,7 +60,9 @@ def __init__(self, name: str = None, epochs: int | None = 1):
         self.pipe = None # set by the pipeline
 
         self.history = TrainingHistory()
+        self.step_history = TrainingHistory()
         self.metrics = Tracker()
+        self.step_metrics = Tracker()
 
         self.step = 0
         self.global_step = 0

@@ -165,10 +167,12 @@ def add_callback(self, callback: 'Callback', priority: int = 1):
         """
         self.callbacks.append(callback, priority)
 
-    def log(self, name: str, value: Any, reduction: str = 'mean', prefixed: bool = True):
+    def log(self, name: str, value: Any, reduction: str = 'mean', prefixed: bool = True, log_step: bool = True):
         if prefixed and self.metric_prefix:
             name = f'{self.metric_prefix}/{name}'
         self.metrics.log(name, value, reduction)
+        if log_step:
+            self.step_metrics.log(name, value, reduction)
 
     def add_metric(self, name, metric):
         metric = metric.to(self.device)

@@ -299,8 +303,6 @@ def next_epoch(self):
 
     def finish_step(self):
         self._post_step()
-        self.step += 1
-        self.global_step += 1
 
     def run_epoch(self):
         """

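With the new log_step flag, a stage can keep values out of the per-step history while still logging them per epoch, and finish_step() now only fires the post-step callbacks (the step counters are advanced by ReduceMetricsCallback). A minimal sketch of a custom stage, assuming the dml.Stage subclassing pattern used in examples/mnist.py; the class name, loop, and values are placeholders, not part of this commit.

import dmlcloud as dml


class ToyStage(dml.Stage):  # subclassing pattern assumed from examples/mnist.py
    def run_epoch(self):
        for loss in [0.9, 0.7, 0.5]:  # stand-in for a real training loop
            self.log('loss', loss)                # recorded per epoch and per step
            self.log('lr', 1e-3, log_step=False)  # epoch-level only, skips step_metrics
            self.finish_step()                    # fires post_step callbacks (step-wise reduction)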
examples/mnist.py

Lines changed: 4 additions & 8 deletions

@@ -2,7 +2,6 @@
 
 import dmlcloud as dml
 import torch
-import torchmetrics
 from torch import nn
 from torch.utils.data import DataLoader
 from torchvision import datasets, transforms

@@ -55,9 +54,6 @@ def pre_stage(self):
         self.add_column('[Val] Loss', 'val/loss', color='cyan')
         self.add_column('[Val] Acc.', 'val/accuracy', formatter=lambda acc: f'{100 * acc:.2f}%', color='cyan')
 
-        self.train_acc = self.add_metric('train/accuracy', torchmetrics.Accuracy('multiclass', num_classes=10))
-        self.val_acc = self.add_metric('val/accuracy', torchmetrics.Accuracy('multiclass', num_classes=10))
-
     # The run_epoch method is called once per epoch
     def run_epoch(self):
         self._train_epoch()

@@ -78,8 +74,9 @@ def _train_epoch(self):
             self.optimizer.step()
 
             self.log('loss', loss)
-            # self.log('accuracy', (output.argmax(1) == target).float().mean())
-            self.train_acc(output, target)
+            self.log('accuracy', (output.argmax(1) == target).float().mean())
+
+            self.finish_step() # optional, but useful to get step-wise metrics
 
     @torch.no_grad()
     def _val_epoch(self):

@@ -93,8 +90,7 @@ def _val_epoch(self):
             loss = self.loss(output, target)
 
             self.log('loss', loss)
-            # self.log('accuracy', (output.argmax(1) == target).float().mean())
-            self.val_acc(output, target)
+            self.log('accuracy', (output.argmax(1) == target).float().mean())
 
 
 def main():

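When checkpointing is enabled, the pipeline wires CsvCallback to the run directory, so the per-step metrics land next to the per-epoch ones. A sketch of reading them back after a run: the run directory, the stage name 'train', and the metric column name are assumptions, while the 'step_metrics_<stage>.csv' file pattern and the leading 'step' column come from CsvCallback above.

import csv
from pathlib import Path

run_dir = Path('runs/my_run')  # hypothetical run directory
with open(run_dir / 'step_metrics_train.csv') as f:  # name built by CsvCallback._build_name
    rows = list(csv.DictReader(f))

steps = [int(row['step']) for row in rows]
losses = [float(row['train/loss']) for row in rows]  # column name depends on the stage's metric prefix
print(steps[:3], losses[:3])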