Commit 7838dc4

Author: Felipe Mello (committed)
fix tests
1 parent 710703e commit 7838dc4

File tree: 5 files changed, +29 -21 lines


apps/sft/main.py

Lines changed: 0 additions & 1 deletion
@@ -78,7 +78,6 @@ def __init__(self, config: DictConfig):
 
         self.current_step = 0
         self.num_training_steps = job_config.training.steps
-        self.metric_logger = None  # TODO: fix this
         self.gradient_accumulation_steps = 1  # Example value, adjust as needed
         self._rank = current_rank().rank
         self._size = math.prod(current_size().values())

src/forge/data/metric_transform.py

Lines changed: 2 additions & 3 deletions
@@ -6,11 +6,10 @@
 
 from typing import Any
 
-from forge.interfaces import Transform
 from forge.observability.metrics import Metric, Reduce
 
 
-class MetricTransform(Transform):
+class MetricTransform:
     """
     Base class for transforms that collect observability metrics from dataset samples.
 
@@ -71,7 +70,7 @@ def __call__(self, sample: dict[str, Any]) -> dict[str, Any]:
         if "metrics" not in sample:
             sample["metrics"] = []
 
-        source_name = self.source or "dataset"
+        source_name = self.source or "unnamed_ds"
 
         # Add samples_processed metric
         sample["metrics"].append(
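
This change drops the Transform inheritance and renames the default source label for unnamed datasets from "dataset" to "unnamed_ds". A minimal, self-contained sketch of that fallback; the SampleCounter class, the Metric dataclass, and the metric key format below are illustrative assumptions, not the forge API:

from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class Metric:
    # Illustrative stand-in for forge.observability.metrics.Metric
    key: str
    value: Any


class SampleCounter:
    """Toy transform that tags a per-sample metric with a dataset source name."""

    def __init__(self, source: Optional[str] = None):
        self.source = source

    def __call__(self, sample: dict[str, Any]) -> dict[str, Any]:
        sample.setdefault("metrics", [])
        # Same fallback as the patched MetricTransform: datasets without an
        # explicit source report under "unnamed_ds".
        source_name = self.source or "unnamed_ds"
        sample["metrics"].append(Metric(key=f"{source_name}/samples_processed", value=1))
        return sample


print(SampleCounter()({"text": "hi"})["metrics"][0].key)          # unnamed_ds/samples_processed
print(SampleCounter("alpaca")({"text": "hi"})["metrics"][0].key)  # alpaca/samples_processed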

tests/unit_tests/datasets/test_hf.py

Lines changed: 3 additions & 2 deletions
@@ -181,9 +181,10 @@ def create_loader():
         assert (
             orig_post_ids == resumed_ids
         ), "Resumed batches should be identical for deterministic run"
+
         assert (
-            result["final_metrics"] == result["resumed_metrics"]
-        ), "Final metrics should match"
+            result["post_checkpoint_metrics"] == result["resumed_metrics"]
+        ), "Resumed training should produce same metrics as original training"
 
     def test_shuffling_behavior(self, dataset_factory, small_dataset_file):
         """Tests that shuffling changes data order between epochs but preserves the set of samples."""

tests/unit_tests/datasets/test_interleaved.py

Lines changed: 6 additions & 4 deletions
@@ -401,14 +401,16 @@ def create_interleaved():
             resume_dataloader=loader2,
         )
 
+        # Verify checkpointing and resumption work correctly
+        # After loading a checkpoint, training should continue identically
         orig_post_ids = [b["id"].tolist() for b in result["post_checkpoint_batches"]]
         resumed_ids = [b["id"].tolist() for b in result["resumed_batches"]]
         assert (
             orig_post_ids == resumed_ids
         ), "Resumed batches should be identical for deterministic run"
         assert (
-            result["final_metrics"] == result["resumed_metrics"]
-        ), "Final metrics should match"
+            result["post_checkpoint_metrics"] == result["resumed_metrics"]
+        ), "Resumed training should produce same metrics as original training"
 
         # Test sampling log functionality
         # Check that sampling log contains tuples of (iteration_count, dataset_name)
@@ -581,8 +583,8 @@ def create_dataloader(dataset):
             f"This indicates sampling state is not properly preserved."
         )
         assert (
-            result["final_metrics"] == result["resumed_metrics"]
-        ), "Final metrics don't match resumed metrics - aggregator state issue"
+            result["post_checkpoint_metrics"] == result["resumed_metrics"]
+        ), "Resumed training should produce same metrics as original training"
 
         # Verify sampling ratio is approximately maintained for nested structure
         all_ids = []
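
The updated assertions in test_hf.py and test_interleaved.py follow from switching to real aggregation: a run resumed from a checkpoint only processes the post-checkpoint batches, so its aggregated metrics should match the post-checkpoint slice of the original run rather than the full-run totals. A hedged illustration with made-up SUM-reduced counts:

# Hypothetical numbers: 10 samples seen before the checkpoint, 5 after.
pre_checkpoint = [1] * 10
post_checkpoint = [1] * 5

final_metrics = sum(pre_checkpoint + post_checkpoint)  # 15: whole original run
post_checkpoint_metrics = sum(post_checkpoint)         # 5: only the post-checkpoint slice
resumed_metrics = sum(post_checkpoint)                 # 5: resumed run replays only that slice

assert resumed_metrics == post_checkpoint_metrics  # the new comparison holds
assert resumed_metrics != final_metrics            # the old comparison would now fail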

tests/unit_tests/datasets/test_iterable_utils.py

Lines changed: 18 additions & 11 deletions
@@ -101,6 +101,9 @@ def generate_ckpt(
     pre_checkpoint_batches = batches[:steps_before_checkpoint]
     post_checkpoint_batches = batches[steps_before_checkpoint:]
 
+    # Compute metrics for post-checkpoint batches only
+    post_checkpoint_metrics = all_metrics[len(checkpoint_metrics) :]
+
     # Resume with new instance if provided
     resumed_batches = []
     resumed_metrics = []
@@ -127,24 +130,28 @@ def generate_ckpt(
         # Original run
         "pre_checkpoint_batches": pre_checkpoint_batches,
         "post_checkpoint_batches": post_checkpoint_batches,
-        "metrics_at_checkpoint": keep_last_metric(checkpoint_metrics),
-        "final_metrics": keep_last_metric(all_metrics),
+        "metrics_at_checkpoint": aggregate_metrics(checkpoint_metrics),
+        "post_checkpoint_metrics": aggregate_metrics(post_checkpoint_metrics),
+        "final_metrics": aggregate_metrics(all_metrics),
         # Resumed run
         "resumed_batches": resumed_batches,
-        "resumed_metrics": keep_last_metric(resumed_metrics),
+        "resumed_metrics": aggregate_metrics(resumed_metrics),
         # Internal state for loading - only if someone needs to manually load
         "_checkpoint_state": checkpoint_state,
     }
 
 
-def keep_last_metric(metrics_list: list) -> dict[str, Any]:
-    result = {}
+def aggregate_metrics(metrics_list: list) -> dict[str, Any]:
+    """Aggregate metrics according to their reduction types (SUM, MEAN, MAX, MIN, STD)."""
+    if not metrics_list:
+        return {}
+
+    accumulators = {}
+
     for metric in metrics_list:
-        # Expect observability.Metric objects only
         key = metric.key
-        value = metric.value
-
-        # For test purposes, just keep the last value of each metric
-        result[key] = value
+        if key not in accumulators:
+            accumulators[key] = metric.reduction.accumulator_class(metric.reduction)
+        accumulators[key].append(metric.value)
 
-    return result
+    return {key: acc.get_value() for key, acc in accumulators.items()}
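
The new aggregate_metrics helper delegates to each metric's reduction via reduction.accumulator_class, which is not shown in this diff. The following self-contained sketch uses simplified, assumed Reduce and Accumulator stand-ins (and a hypothetical seq_len metric) to illustrate the same group-by-key, accumulate, then finalize pattern:

from dataclasses import dataclass, field
from enum import Enum
from typing import Any


class Reduce(Enum):
    # Assumed subset of the reduction types named in the docstring above
    SUM = "sum"
    MEAN = "mean"
    MAX = "max"
    MIN = "min"


@dataclass
class Accumulator:
    """Collects raw values and reduces them on demand."""
    reduction: Reduce
    values: list = field(default_factory=list)

    def append(self, value: Any) -> None:
        self.values.append(value)

    def get_value(self) -> Any:
        if self.reduction is Reduce.SUM:
            return sum(self.values)
        if self.reduction is Reduce.MEAN:
            return sum(self.values) / len(self.values)
        if self.reduction is Reduce.MAX:
            return max(self.values)
        return min(self.values)


@dataclass
class Metric:
    # Stand-in for forge.observability.metrics.Metric
    key: str
    value: Any
    reduction: Reduce


def aggregate_metrics(metrics_list: list) -> dict[str, Any]:
    """Group metrics by key and reduce each group according to its reduction type."""
    accumulators: dict[str, Accumulator] = {}
    for metric in metrics_list:
        if metric.key not in accumulators:
            accumulators[metric.key] = Accumulator(metric.reduction)
        accumulators[metric.key].append(metric.value)
    return {key: acc.get_value() for key, acc in accumulators.items()}


metrics = [
    Metric("samples_processed", 1, Reduce.SUM),
    Metric("samples_processed", 1, Reduce.SUM),
    Metric("seq_len", 128, Reduce.MEAN),
    Metric("seq_len", 256, Reduce.MEAN),
]
print(aggregate_metrics(metrics))  # {'samples_processed': 2, 'seq_len': 192.0}

Under the old keep_last_metric helper, samples_processed would have come out as 1 and seq_len as 256, which is why the resumed-run comparisons in the tests above had to change.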
