 # LICENSE file in the root directory of this source tree.

 import heapq
+import itertools
 import logging

 import os
@@ -145,37 +146,54 @@ def record_episode_sample(key: str, episode):
     record_metric(key, sample, Reduce.SAMPLE)


-def reduce_metrics_states(states: List[Dict[str, Dict[str, Any]]]) -> Dict[str, Any]:
-    """Reduce metric accumulators states to a single value per metric.
+def reduce_metrics_states(
+    states: List[Dict[str, Dict[str, Any]]]
+) -> tuple[Dict[str, Any], Dict[str, list[dict]]]:
+    """
+    Reduce metric accumulator states across ranks into two groups:
+    - scalar metrics (mean/sum/etc.)
+    - sample metrics (list[dict])

-    Can be used when reducing metrics across ranks or services, as merging
-    states is more precise than merging locally reduced metrics.
+    This function merges metric accumulator states from multiple ranks or processes
+    into final reduced values. It automatically distinguishes between scalar reductions
+    (e.g., MEAN, SUM) and structured SAMPLE-type reductions (e.g., per-example dicts).

     Args:
         states (List[Dict[str, Dict[str, Any]]]): List of states of one or more metrics,
             normally retrieved using `forge.observability.metrics.MetricAccumulator.get_state()`.

     Returns:
-        Dict[str, Any]: Dictionary with format {metric_key: reduced_value}
+        metrics: Dict[str, Any], {metric_key: reduced_scalar_value}
+        samples: Dict[str, list[dict]], {metric_key: merged_list_of_samples}

     Example:
-        states = [
-            {"loss": {"count": 5, "sum": 14, "reduction_type": Reduce.MEAN}},
-            {"loss": {"count": 10, "sum": 16, "reduction_type": Reduce.MEAN}},
-        ]
-        reduce_metrics_states(states)
-        >>> {"loss": 2.0}
+        >>> states = [
+        ...     {
+        ...         "loss": {"count": 5, "sum": 14, "reduction_type": "mean"},
+        ...         "rollout/sample": {"reduction_type": "sample", "samples": [{"id": 1}]},
+        ...     },
+        ...     {
+        ...         "loss": {"count": 10, "sum": 26, "reduction_type": "mean"},
+        ...         "rollout/sample": {"reduction_type": "sample", "samples": [{"id": 2}]},
+        ...     },
+        ... ]
+        >>> metrics, samples = reduce_metrics_states(states)
+        >>> metrics
+        {'loss': 2.6666666666666665}
+        >>> samples
+        {'rollout/sample': [{'id': 1}, {'id': 2}]}

     Raises:
         ValueError: on mismatched reduction types for the same metric key.
     """
     if not states:
-        return {}
+        return {}, {}

     # Collect unique keys across all states
     all_keys = set(k for state in states for k in state)
+    metrics: Dict[str, Any] = {}
+    samples: Dict[str, list[dict]] = {}

-    reduced_metrics = {}
     for key in all_keys:
         metric_states = [state.get(key) for state in states if key in state]
         if not metric_states:
@@ -194,9 +212,14 @@ def reduce_metrics_states(states: List[Dict[str, Dict[str, Any]]]) -> Dict[str,

         metric_accumulator = Reduce(first_reduction_type).accumulator_class
         reduced_value = metric_accumulator.get_reduced_value_from_states(metric_states)
-        reduced_metrics[key] = reduced_value

-    return reduced_metrics
+        # Route SAMPLE-type reductions into samples (list[dict]) and everything
+        # else into metrics (scalars), matching the return contract above.
+        if first_reduction_type == Reduce.SAMPLE.value:
+            samples[key] = reduced_value
+        else:
+            metrics[key] = reduced_value
+    return metrics, samples

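For reference, the SAMPLE reduction reduces by concatenation across ranks rather than by a numeric fold. A minimal runnable sketch of that contract, assuming the state shape from the docstring example above (this SampleAccumulator is a local stand-in, not the module's class):

from typing import Any, Dict, List

class SampleAccumulator:
    """Stand-in: reduces SAMPLE states by concatenating their row lists."""

    @classmethod
    def get_reduced_value_from_states(cls, states: List[Dict[str, Any]]) -> List[dict]:
        merged: List[dict] = []
        for state in states:
            # Each rank contributes its rows under the "samples" key.
            merged.extend(state.get("samples", []))
        return merged


states = [{"samples": [{"id": 1}]}, {"samples": [{"id": 2}]}]
assert SampleAccumulator.get_reduced_value_from_states(states) == [{"id": 1}, {"id": 2}]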

 #################
@@ -271,36 +294,39 @@ def __init__(self, top_k=1, bottom_k=1, key="reward"):
         self.key = key
         self._top_heap = []  # min-heap for top-k
         self._bottom_heap = []  # max-heap for bottom-k (store -value)
+        self._counter = itertools.count()  # tie-breaker id generator

     def filter_append(self, sample: Dict) -> bool:
         val = sample.get(self.key, 0.0)
+        idx = next(self._counter)  # unique tiebreaker
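+        # Without idx, two entries with equal val would fall back to comparing
+        # the sample dicts themselves, and dicts don't support "<" (TypeError).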

         # If top_k or bottom_k <= 0, it means "disable" that side of filtering (i.e., keep none).
         # maintain top-k
         if self.top_k > 0:
             if len(self._top_heap) < self.top_k:
-                heapq.heappush(self._top_heap, (val, sample))
+                heapq.heappush(self._top_heap, (val, idx, sample))
             else:
-                heapq.heappushpop(self._top_heap, (val, sample))
+                heapq.heappushpop(self._top_heap, (val, idx, sample))

         # maintain bottom-k
         if self.bottom_k > 0:
             if len(self._bottom_heap) < self.bottom_k:
-                heapq.heappush(self._bottom_heap, (-val, sample))
+                heapq.heappush(self._bottom_heap, (-val, idx, sample))
             else:
-                heapq.heappushpop(self._bottom_heap, (-val, sample))
+                heapq.heappushpop(self._bottom_heap, (-val, idx, sample))

         # always return False here because we don't store in the samples list
         return False

     def filter_flush(self, samples: List[Dict]) -> List[Dict]:
-        tops = [s for _, s in self._top_heap]
-        bottoms = [s for _, s in self._bottom_heap]
+        tops = [s for _, _, s in self._top_heap]
+        bottoms = [s for _, _, s in self._bottom_heap]
         return bottoms + tops

     def reset(self):
         self._top_heap = []
         self._bottom_heap = []
+        self._counter = itertools.count()

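The (value, tiebreaker, payload) heap pattern above is easy to sanity-check in isolation. A self-contained sketch (the names here are illustrative, not the module's API):

import heapq
import itertools

counter = itertools.count()
top_heap = []  # min-heap of (reward, idx, sample); the root is the smallest kept entry
top_k = 2

for sample in ({"reward": 1.0}, {"reward": 3.0}, {"reward": 3.0}, {"reward": 2.0}):
    entry = (sample["reward"], next(counter), sample)
    if len(top_heap) < top_k:
        heapq.heappush(top_heap, entry)
    else:
        # Push the new entry and evict the current minimum in one O(log k) step.
        heapq.heappushpop(top_heap, entry)

# The two equal 3.0 rewards compare on idx, never on the dicts themselves.
assert sorted(r for r, _, _ in top_heap) == [3.0, 3.0]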
 ################
@@ -670,14 +696,27 @@ async def flush(

         # Reduce metrics from states for logging if any per-rank backend
         if self.logger_backends:
-            metrics = {}
+            # Prepare two groups: normal metrics and sample-type metrics
+            metrics: Dict[str, Any] = {}
+            samples: Dict[str, list[dict]] = {}
             for key, state in states.items():
-                acc_class = Reduce(state["reduction_type"]).accumulator_class
-                metrics[key] = acc_class.get_reduced_value_from_states([state])
+                reduction_type = state["reduction_type"]
+                acc_class = Reduce(reduction_type).accumulator_class
+                value = acc_class.get_reduced_value_from_states([state])
+
+                if reduction_type == Reduce.SAMPLE.value:
+                    # sample-type metrics → list[dict]
+                    samples[key] = value
+                else:
+                    # scalar metrics → float/int/etc.
+                    metrics[key] = value

             # Log to local logger_backends
             for logger_backend in self.logger_backends:
-                await logger_backend.log(metrics, step)
+                if metrics:
+                    await logger_backend.log(metrics, step)
+                if samples:
+                    await logger_backend.log_samples(samples, step)

         return states if return_state else {}

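Note that the branch above compares the raw state value against Reduce.SAMPLE.value rather than the enum member, which matches the string-serialized states shown in the docstring example earlier. A self-contained illustration of that round-trip (this local Reduce is a stand-in for the module's enum):

from enum import Enum

class Reduce(Enum):  # minimal stand-in for the real Reduce enum
    MEAN = "mean"
    SAMPLE = "sample"

state = {"reduction_type": "sample"}  # reduction type serialized as a plain string
reduction_type = state["reduction_type"]

assert Reduce(reduction_type) is Reduce.SAMPLE  # value -> member lookup, as in flush()
assert reduction_type == Reduce.SAMPLE.value    # the string comparison used above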
@@ -728,6 +767,9 @@ async def init(
     async def log(self, metrics: Dict[str, Any], step: int) -> None:
         pass

+    async def log_samples(self, samples: Dict[str, List[dict]], step: int) -> None:
+        pass
+
     async def finish(self) -> None:
         pass

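Because both hooks default to no-ops, a backend only overrides what it supports and existing backends keep working unchanged. A minimal sketch of a sample-only backend (the class and its base are hypothetical, for illustration):

from typing import Any, Dict, List

class InMemoryBackend:  # would subclass the logger backend base class above
    """Collects sample rows in memory, e.g. for tests."""

    def __init__(self) -> None:
        self.rows: List[tuple] = []

    async def log(self, metrics: Dict[str, Any], step: int) -> None:
        pass  # this toy backend only records samples

    async def log_samples(self, samples: Dict[str, List[dict]], step: int) -> None:
        # Flatten {key: [row, ...]} into (step, key, row) tuples.
        for key, rows in samples.items():
            for row in rows:
                self.rows.append((step, key, row))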
@@ -763,13 +805,13 @@ async def log_samples(self, samples: Dict[str, List[dict]], step: int) -> None:
763805 """Pretty-print sample-level logs to console."""
764806 if not samples :
765807 return
766- import pprint
808+ import json
767809
768810 logger .info (f"=== [{ self .prefix } ] - SAMPLE LOGS STEP { step } ===" )
769811 for key , rows in samples .items ():
770812 logger .info (f"[{ key } ] ({ len (rows )} samples)" )
771813 for sample in rows :
772- pretty = pprint . pformat (sample , indent = 4 , width = 120 , compact = True )
814+ pretty = json . dumps (sample , indent = 2 , ensure_ascii = False )
773815 logger .info (pretty )
774816 logger .info ("==============================================\n " )
775817
@@ -805,6 +847,7 @@ def __init__(self, logger_backend_config: Dict[str, Any]):
805847 "reduce_across_ranks" , True
806848 )
807849 self .share_run_id = logger_backend_config .get ("share_run_id" , False )
850+ self .tables = {} # keep persistent tables per key

     async def init(
         self,
@@ -891,18 +934,25 @@ async def log_samples(self, samples: Dict[str, List[dict]], step: int) -> None:

         if not self.run or not samples:
             return
-
         for key, rows in samples.items():
             if not rows:
                 continue
-
             # Create a WandB Table dynamically based on keys of first sample
             columns = list(rows[0].keys())
             table = wandb.Table(columns=columns)
             for sample in rows:
-                table.add_data(*[sample.get(c) for c in columns])
-
-            self.run.log({f"{key}_table": table, "global_step": step})
+                values = [sample.get(c) for c in columns]
+                table.add_data(*values)
+            self.run.log(
+                {
+                    f"{key}_step_{step}_table": table,
+                    "_sample_rows_logged": len(rows),
+                    "global_step": step,
+                },
+                commit=True,
+            )
             logger.info(
                 f"WandbBackend: Logged {len(rows)} samples for {key} at step {step}"
             )