add accumulator and test

DNXie · DNXie · commit 37c2ac99f76b · 2025-11-16T17:08:27.000-08:00
diff --git a/src/forge/observability/__init__.py b/src/forge/observability/__init__.py
@@ -24,8 +24,10 @@
     record_metric,
     Reduce,
     reduce_metrics_states,
+    SampleAccumulator,
     StdAccumulator,
     SumAccumulator,
+    TopBottomKFilter,
     WandbBackend,
 )
 from .perf_tracker import trace, Tracer
@@ -64,4 +66,7 @@
     "MaxAccumulator",
     "MinAccumulator",
     "StdAccumulator",
+    "SampleAccumulator",
+    # Filter classes
+    "TopBottomKFilter",
 ]
diff --git a/src/forge/observability/metrics.py b/src/forge/observability/metrics.py
@@ -4,13 +4,15 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import heapq
+import itertools
 import logging
 import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
-from typing import Any
+from typing import Any, Dict, List
 
 import pytz
 
@@ -68,6 +70,7 @@ class Reduce(Enum):
     MAX = "max"
     MIN = "min"
     STD = "std"
+    SAMPLE = "sample"
 
     @property
     def accumulator_class(self):
@@ -77,6 +80,7 @@ def accumulator_class(self):
             Reduce.MAX: MaxAccumulator,
             Reduce.MIN: MinAccumulator,
             Reduce.STD: StdAccumulator,
+            Reduce.SAMPLE: SampleAccumulator,
         }
         return mapping[self]
 
@@ -182,6 +186,55 @@ def reduce_metrics_states(states: list[dict[str, dict[str, Any]]]) -> list[Metri
     return reduced_metrics
 
 
+#################
+# SampleFilters #
+#################
+
+
+class TopBottomKFilter:
+    """Keep the top-k and bottom-k samples by a given key (e.g., reward)."""
+
+    def __init__(self, top_k=1, bottom_k=1, key="reward"):
+        self.top_k = top_k
+        self.bottom_k = bottom_k
+        self.key = key
+        self._top_heap = []  # min-heap for top-k
+        self._bottom_heap = []  # max-heap for bottom-k (store -value)
+        self._counter = itertools.count()  # tie-breaker id generator
+
+    def filter_append(self, sample: Dict) -> bool:
+        val = sample.get(self.key, 0.0)
+        idx = next(self._counter)  # unique tiebreaker
+
+        # If top_k or bottom_k <= 0, it means "disable" that side of filtering (i.e., keep none).
+        # maintain top-k
+        if self.top_k > 0:
+            if len(self._top_heap) < self.top_k:
+                heapq.heappush(self._top_heap, (val, idx, sample))
+            else:
+                heapq.heappushpop(self._top_heap, (val, idx, sample))
+
+        # maintain bottom-k
+        if self.bottom_k > 0:
+            if len(self._bottom_heap) < self.bottom_k:
+                heapq.heappush(self._bottom_heap, (-val, idx, sample))
+            else:
+                heapq.heappushpop(self._bottom_heap, (-val, idx, sample))
+
+        # always return False here because we don't store in samples list
+        return False
+
+    def filter_flush(self, samples: List[Dict]) -> List[Dict]:
+        tops = [s for _, _, s in self._top_heap]
+        bottoms = [s for _, _, s in self._bottom_heap]
+        return bottoms + tops
+
+    def reset(self):
+        self._top_heap = []
+        self._bottom_heap = []
+        self._counter = itertools.count()
+
+
 ################
 # Accumulators #
 ################
@@ -392,6 +445,50 @@ def reset(self) -> None:
         self.count = 0
 
 
+class SampleAccumulator(MetricAccumulator):
+    """Accumulator for sample-level metrics (e.g., prompt/response/reward dicts).
+    Optionally uses a sample filter to decide what to keep at append/flush time.
+    """
+
+    def __init__(self, reduction: Reduce):
+        super().__init__(reduction)
+        self.samples: List[Dict[str, Any]] = []
+        self.filter = TopBottomKFilter()
+
+    def append(self, value: dict) -> None:
+        if not isinstance(value, dict):
+            raise ValueError(f"Expected dict, got {type(value)}")
+
+        # Only keep the sample if filter_append returns True
+        if self.filter.filter_append(value):
+            self.samples.append(value)
+
+    def get_value(self) -> list[dict]:
+        """Return locally collected (and optionally filtered) samples."""
+        # Apply flush-time filter (e.g. heap selection, threshold trimming)
+        return self.filter.filter_flush(self.samples)
+
+    def get_state(self) -> Dict[str, Any]:
+        """Serialize accumulator state for cross-rank reduction."""
+        return {
+            "reduction_type": self.reduction_type.value,
+            "samples": self.get_value(),
+        }
+
+    @classmethod
+    def get_reduced_value_from_states(cls, states: List[Dict[str, Any]]) -> list[dict]:
+        """Merge sample states across ranks."""
+        merged = []
+        for s in states:
+            merged.extend(s.get("samples", []))
+        return merged
+
+    def reset(self) -> None:
+        """Clear local samples and reset filter state."""
+        self.samples.clear()
+        self.filter.reset()
+
+
 #############
 # Collector #
 #############