Merge TopBottomKFilter into SampleAccumulator

DNXie · DNXie · commit 2d52ebfe45e3 · 2025-11-04T20:24:49.000-08:00
diff --git a/src/forge/observability/__init__.py b/src/forge/observability/__init__.py
@@ -28,7 +28,6 @@
     SampleAccumulator,
     StdAccumulator,
     SumAccumulator,
-    TopBottomKFilter,
     WandbBackend,
 )
 from .perf_tracker import trace, Tracer
@@ -69,6 +68,4 @@
     "MinAccumulator",
     "StdAccumulator",
     "SampleAccumulator",
-    # Filter classes
-    "TopBottomKFilter",
 ]
diff --git a/src/forge/observability/metric_actors.py b/src/forge/observability/metric_actors.py
@@ -428,9 +428,9 @@ def extract_values_from_valuemesh(results) -> list[dict[str, Any]]:
             scalar_metrics = [
                 m for m in reduced_metrics if m.reduction != Reduce.SAMPLE
             ]
-            sample_metrics = {
+            sample_metrics = [
                 m for m in reduced_metrics if m.reduction == Reduce.SAMPLE
-            }
+            ]
 
             # Log to global backends
             for backend_name, backend in self.global_logger_backends.items():
diff --git a/src/forge/observability/metrics.py b/src/forge/observability/metrics.py
@@ -199,55 +199,6 @@ def record_episode_sample(table_name: str, episode):
     record_metric(table_name, sample, Reduce.SAMPLE)
 
 
-#################
-# SampleFilters #
-#################
-
-
-class TopBottomKFilter:
-    """Keep the top-k and bottom-k samples by a given key (e.g., reward)."""
-
-    def __init__(self, top_k=1, bottom_k=1, key="reward"):
-        self.top_k = top_k
-        self.bottom_k = bottom_k
-        self.key = key
-        self._top_heap = []  # min-heap for top-k
-        self._bottom_heap = []  # max-heap for bottom-k (store -value)
-        self._counter = itertools.count()  # tie-breaker id generator
-
-    def filter_append(self, sample: Dict) -> bool:
-        val = sample.get(self.key, 0.0)
-        idx = next(self._counter)  # unique tiebreaker
-
-        # If top_k or bottom_k <= 0, it means "disable" that side of filtering (i.e., keep none).
-        # maintain top-k
-        if self.top_k > 0:
-            if len(self._top_heap) < self.top_k:
-                heapq.heappush(self._top_heap, (val, idx, sample))
-            else:
-                heapq.heappushpop(self._top_heap, (val, idx, sample))
-
-        # maintain bottom-k
-        if self.bottom_k > 0:
-            if len(self._bottom_heap) < self.bottom_k:
-                heapq.heappush(self._bottom_heap, (-val, idx, sample))
-            else:
-                heapq.heappushpop(self._bottom_heap, (-val, idx, sample))
-
-        # always return False here because we don't store in samples list
-        return False
-
-    def filter_flush(self, samples: List[Dict]) -> List[Dict]:
-        tops = [s for _, _, s in self._top_heap]
-        bottoms = [s for _, _, s in self._bottom_heap]
-        return bottoms + tops
-
-    def reset(self):
-        self._top_heap = []
-        self._bottom_heap = []
-        self._counter = itertools.count()
-
-
 ################
 # Accumulators #
 ################
@@ -459,30 +410,53 @@ def reset(self) -> None:
 
 
 class SampleAccumulator(MetricAccumulator):
-    """Accumulator for sample-level metrics (e.g., prompt/response/reward dicts).
-    Optionally uses a sample filter to decide what to keep at append/flush time.
+    """Accumulator for sample-level metrics with top-k and bottom-k filtering.
+
+    Keeps the top-k and bottom-k samples by a given key (e.g., reward).
+    Useful for logging only the best and worst samples from a batch.
     """
 
-    def __init__(self, reduction: Reduce):
+    def __init__(
+        self, reduction: Reduce, top_k: int = 1, bottom_k: int = 1, key: str = "reward"
+    ):
         super().__init__(reduction)
         self.samples: List[Dict[str, Any]] = []
-        self.filter = TopBottomKFilter()
+        self.top_k = top_k
+        self.bottom_k = bottom_k
+        self.key = key
+        self._top_heap = []  # min-heap for top-k
+        self._bottom_heap = []  # max-heap for bottom-k (store -value)
+        self._counter = itertools.count()  # tie-breaker id generator
         self.is_reset = True
 
     def append(self, value: dict) -> None:
         if not isinstance(value, dict):
             raise ValueError(f"Expected dict, got {type(value)}")
 
         self.is_reset = False
-        # Only keep the sample if filter_append returns True
-        if self.filter.filter_append(value):
-            self.samples.append(value)
+        val = value.get(self.key, 0.0)
+        idx = next(self._counter)  # unique tiebreaker
+
+        # If top_k or bottom_k <= 0, it means "disable" that side of filtering (i.e., keep none).
+        # maintain top-k
+        if self.top_k > 0:
+            if len(self._top_heap) < self.top_k:
+                heapq.heappush(self._top_heap, (val, idx, value))
+            else:
+                heapq.heappushpop(self._top_heap, (val, idx, value))
+
+        # maintain bottom-k
+        if self.bottom_k > 0:
+            if len(self._bottom_heap) < self.bottom_k:
+                heapq.heappush(self._bottom_heap, (-val, idx, value))
+            else:
+                heapq.heappushpop(self._bottom_heap, (-val, idx, value))
 
     def get_value(self) -> list[dict]:
-        """Return locally collected (and optionally filtered) samples."""
-        # Apply flush-time filter (e.g. heap selection, threshold trimming)
-        results = self.filter.filter_flush(self.samples)
-        return results
+        """Return top-k and bottom-k filtered samples."""
+        tops = [s for _, _, s in self._top_heap]
+        bottoms = [s for _, _, s in self._bottom_heap]
+        return bottoms + tops
 
     def get_state(self) -> Dict[str, Any]:
         """Serialize accumulator state for cross-rank reduction."""
@@ -503,7 +477,9 @@ def reset(self) -> None:
         """Clear local samples and reset filter state."""
         self.is_reset = True
         self.samples.clear()
-        self.filter.reset()
+        self._top_heap = []
+        self._bottom_heap = []
+        self._counter = itertools.count()
 
 
 #############
diff --git a/tests/unit_tests/data/test_metrics_aggregator.py b/tests/unit_tests/data/test_metrics_aggregator.py
@@ -247,7 +247,7 @@ def test_handler_replacement_warning(self, caplog):
         assert "Replacing handler for AggregationType.SUM" in caplog.records[0].message
 
     def test_sample_accumulator_with_topbottom_filter(self):
-        """Ensure SampleAccumulator integrates with TopBottomKFilter correctly."""
+        """Ensure SampleAccumulator keeps the top-k and bottom-k samples correctly."""
         from forge.observability.metrics import Reduce, SampleAccumulator
 
         acc = SampleAccumulator(Reduce.SAMPLE)

Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,6 @@`
`28`	`28`	`SampleAccumulator,`
`29`	`29`	`StdAccumulator,`
`30`	`30`	`SumAccumulator,`
`31`		`- TopBottomKFilter,`
`32`	`31`	`WandbBackend,`
`33`	`32`	`)`
`34`	`33`	`from .perf_tracker import trace, Tracer`
`@@ -69,6 +68,4 @@`
`69`	`68`	`"MinAccumulator",`
`70`	`69`	`"StdAccumulator",`
`71`	`70`	`"SampleAccumulator",`
`72`		`- # Filter classes`
`73`		`- "TopBottomKFilter",`
`74`	`71`	`]`