Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
e162788
metric logger simple example
Sep 19, 2025
9f13bfb
it works
Sep 22, 2025
4b9aada
delete old files
Sep 22, 2025
2864324
refactoring + docstrings
Sep 23, 2025
c337083
docstring
Sep 23, 2025
16f2267
comments
Sep 23, 2025
40e16c2
update method name
Sep 23, 2025
d7c175d
no circular import
Sep 23, 2025
538e8f2
update command
Sep 23, 2025
64c71f2
Merge branch 'main' of https://github.com/pytorch-labs/forge into met…
Sep 23, 2025
166b5d4
update arg name
Sep 23, 2025
e27d451
move metric actor out of asyncio lock
Sep 23, 2025
11ea544
Merge branch 'main' of https://github.com/pytorch-labs/forge into met…
Sep 23, 2025
4a8db51
Merge branch 'main' into metric_logging
Sep 23, 2025
5cadbee
fix deregister
Sep 23, 2025
cb33d5f
lint
Sep 23, 2025
f28097c
docstring
Sep 23, 2025
3772620
Merge branch 'main' of github.com:meta-pytorch/forge into metric_logging
Sep 24, 2025
06afbb5
fix result extraction and add logger shutdown
Sep 24, 2025
5369939
fix shutdown order
Sep 24, 2025
ffe09b9
simplification + docstrings
Sep 26, 2025
185504d
bug fix + register if respawn
Sep 26, 2025
052e937
it works
Sep 26, 2025
efb639d
use procmesh as key
Sep 26, 2025
781906d
docstring
Sep 26, 2025
f2a9e09
remove protected imports
Sep 26, 2025
8e157bd
create get_metric_logger
Sep 26, 2025
5465080
Merge branch 'main' of github.com:meta-pytorch/forge into metric_logging
Sep 26, 2025
5736c79
call became fanout
Sep 26, 2025
2b0bb05
Merge branch 'main' of github.com:meta-pytorch/forge into metric_logging
Sep 29, 2025
a426cd5
upstream changes
Sep 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/grpo/qwen3_1_7b.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Grouped Relative Policy Optimization (GRPO)
# >>> python -m apps.grpo.qwen3_1_7b --config apps/grpo/qwen3_1_7b.yaml
# >>> python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml

# Global configuration
group_size: 8
Expand Down
91 changes: 91 additions & 0 deletions apps/toy_metrics/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import asyncio

import logging
import time

from forge.controller.actor import ForgeActor
from forge.controller.provisioner import shutdown
from forge.observability.metric_actors import setup_metric_logger
from forge.observability.metrics import record_metric, ReductionType

from monarch.actor import current_rank, endpoint

# DEBUG level so the example prints the metric pipeline's internal activity.
logging.basicConfig(level=logging.DEBUG)


class TrainActor(ForgeActor):
    """Toy training actor: each step records a synthetic loss metric."""

    @endpoint
    async def train_step(self, step: int):
        # Deterministic fake loss derived from rank and step, so output is
        # reproducible and each rank's contribution is distinguishable.
        rank = current_rank().rank
        loss = 100 * step + rank * 1000
        print(f"[TRAIN] Rank {rank}: Step {step}, loss={loss}")
        # Uses record_metric's default reduction (no explicit ReductionType).
        record_metric("train/loss", loss)


class GeneratorActor(ForgeActor):
    """Toy generation actor: each substep records a token-count metric."""

    @endpoint
    async def generate_step(self, step: int, substep: int):
        # Deterministic fake token count derived from rank, step, and substep.
        rank = current_rank().rank
        tokens = substep * 10 + step * 100 + rank * 1000
        print(f"[GEN] Rank {rank}: Step {step}.{substep}, tokens={tokens}")
        # Token counts are summed (not averaged) when reduced.
        record_metric("generate/tokens", tokens, ReductionType.SUM)


# Main
async def main():
    """Example demonstrating distributed metric logging with different backends."""
    # Unique group name per run, keyed by wall-clock time.
    group = f"grpo_exp_{int(time.time())}"

    # Backend config: {backend_name: backend_config_dict}.
    # Each backend may set reduce_across_ranks to control whether metrics are
    # reduced before logging or logged per rank.
    config = {
        "console": {"reduce_across_ranks": True},
        "wandb": {
            "project": "my_project",
            "group": group,
            "reduce_across_ranks": True,
            # Only useful if NOT reduce_across_ranks.
            "share_run_id": False,  # Share run ID across ranks -- Not recommended.
        },
    }

    # Global metric logger must exist before services spawn, since spawning
    # triggers per-process registration via the provisioner hook.
    mlogger = await setup_metric_logger()

    opts = {"procs": 2, "num_replicas": 2, "with_gpus": False}
    trainer = await TrainActor.options(**opts).as_service()
    generator = await GeneratorActor.options(**opts).as_service()

    # Initialize backends eagerly across all registered fetchers.
    await mlogger.init_backends.call_one(config)

    for step in range(3):
        print(f"\n=== Global Step {step} ===")
        await trainer.train_step.fanout(step)
        for substep in range(3):
            await generator.generate_step.fanout(step, substep)
        # Flush once per global step so each step's metrics land together.
        await mlogger.flush.call_one(step)

    # Tear down: logger first, then both services concurrently, then provisioner.
    await mlogger.shutdown.call_one()
    await asyncio.gather(trainer.shutdown(), generator.shutdown())
    await shutdown()


# Script entry point: run the async example to completion.
if __name__ == "__main__":
    asyncio.run(main())
8 changes: 1 addition & 7 deletions src/forge/controller/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .actor import ForgeActor
from .proc_mesh import get_proc_mesh, stop_proc_mesh

Expand All @@ -24,9 +23,4 @@ async def spawn_actors(
return actors


__all__ = [
"spawn_actors",
"stop_proc_mesh",
"get_proc_mesh",
"ForgeActor",
]
__all__ = ["spawn_actors", "stop_proc_mesh", "get_proc_mesh", "ForgeActor"]
10 changes: 10 additions & 0 deletions src/forge/controller/provisioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from monarch.tools.components import hyperactor
from monarch.tools.config import Config

from forge.observability.metric_actors import setup_metric_logger

from forge.types import ProcessConfig

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -215,11 +217,19 @@ def bootstrap(gpu_ids: list[str]):
self._server_names.append(server_name)
self._proc_server_map[procs] = server_name

# Spawn local logging actor on each process and register with global logger
_ = await setup_metric_logger(procs)

return procs

async def stop_proc_mesh(self, proc_mesh: ProcMesh):
"""Stops a proc mesh."""
async with self._lock:
# Deregister local logger from global logger
if hasattr(proc_mesh, "_local_fetcher"):
global_logger = await setup_metric_logger(proc_mesh)
await global_logger.deregister_fetcher.call_one(proc_mesh)

if hasattr(proc_mesh, "_gpu_ids"):
gpu_manager = self._host_gpu_map[proc_mesh._host._host_id]
gpu_manager.release_gpus(proc_mesh._gpu_ids)
Expand Down
54 changes: 54 additions & 0 deletions src/forge/observability/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from .metric_actors import GlobalLoggingActor, LocalFetcherActor, setup_metric_logger
from .metrics import (
ConsoleBackend,
# Utility functions
get_actor_name_with_rank,
get_logger_backend_class,
# Backend classes
LoggerBackend,
MaxAccumulator,
MeanAccumulator,
# Accumulator classes
MetricAccumulator,
MetricCollector,
MinAccumulator,
record_metric,
reduce_metrics_states,
ReductionType,
StdAccumulator,
SumAccumulator,
WandbBackend,
)

__all__ = [
# Main API functions
"record_metric",
"reduce_metrics_states",
"get_actor_name_with_rank",
"get_logger_backend_class",
"setup_metric_logger",
# Enums
"ReductionType",
# Actor classes
"GlobalLoggingActor",
"LocalFetcherActor",
# Collector
"MetricCollector",
# Backend classes
"LoggerBackend",
"ConsoleBackend",
"WandbBackend",
# Accumulator classes
"MetricAccumulator",
"MeanAccumulator",
"SumAccumulator",
"MaxAccumulator",
"MinAccumulator",
"StdAccumulator",
]
Loading
Loading