Reapply "Metric Logging updates 4/N - better actor name (#351)" (#429)

Felipe Mello · Felipe Mello · commit 92326bc09cee · 2025-10-17T09:27:54.000-07:00
This reverts commit 633b219 (the earlier revert of #351), reapplying that change.
diff --git a/apps/grpo/main.py b/apps/grpo/main.py
@@ -305,7 +305,7 @@ async def main(cfg: DictConfig):
         provisioner = await init_provisioner()
 
     metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}})
-    mlogger = await get_or_create_metric_logger()
+    mlogger = await get_or_create_metric_logger(process_name="Controller")
     await mlogger.init_backends.call_one(metric_logging_cfg)
 
     # ---- Setup services ---- #
diff --git a/src/forge/observability/__init__.py b/src/forge/observability/__init__.py
@@ -12,8 +12,6 @@
 from .metrics import (
     BackendRole,
     ConsoleBackend,
-    get_actor_name_with_rank,
-    get_logger_backend_class,
     LoggerBackend,
     MaxAccumulator,
     MeanAccumulator,
@@ -29,12 +27,12 @@
     WandbBackend,
 )
 from .perf_tracker import trace, Tracer
+from .utils import get_proc_name_with_rank
 
 __all__ = [
     # Main API functions
     "record_metric",
     "reduce_metrics_states",
-    "get_actor_name_with_rank",
     "get_logger_backend_class",
     "get_or_create_metric_logger",
     # Performance tracking
@@ -45,6 +43,8 @@
     "BackendRole",
     # Enums
     "Reduce",
+    # Utility functions
+    "get_proc_name_with_rank",
     # Actor classes
     "GlobalLoggingActor",
     "LocalFetcherActor",
diff --git a/tests/sandbox/toy_rl/toy_metrics/main.py b/tests/sandbox/toy_rl/toy_metrics/main.py
@@ -95,12 +95,16 @@ async def main():
     }
 
     service_config = {"procs": 2, "num_replicas": 2, "with_gpus": False}
-    mlogger = await get_or_create_metric_logger()
+    mlogger = await get_or_create_metric_logger(process_name="Controller")
     await mlogger.init_backends.call_one(config)
 
     # Spawn services first (triggers registrations via provisioner hook)
-    trainer = await TrainActor.options(**service_config).as_service()
-    generator = await GeneratorActor.options(**service_config).as_service()
+    trainer = await TrainActor.options(
+        **service_config, mesh_name="TrainActor"
+    ).as_service()
+    generator = await GeneratorActor.options(
+        **service_config, mesh_name="GeneratorActor"
+    ).as_service()
 
     for i in range(3):
         print(f"\n=== Global Step {i} ===")
diff --git a/tests/sandbox/vllm/main.py b/tests/sandbox/vllm/main.py
@@ -33,7 +33,7 @@ async def run(cfg: DictConfig):
             ProvisionerConfig(launcher_config=LauncherConfig(**cfg.provisioner))
         )
     metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}})
-    mlogger = await get_or_create_metric_logger()
+    mlogger = await get_or_create_metric_logger(process_name="Controller")
     await mlogger.init_backends.call_one(metric_logging_cfg)
 
     if (prompt := cfg.get("prompt")) is None:
diff --git a/tests/unit_tests/observability/conftest.py b/tests/unit_tests/observability/conftest.py
@@ -22,13 +22,14 @@ def __init__(self, logger_backend_config=None):
         self.finish_called = False
         self.metadata = {}
 
-    async def init(self, role="local", primary_logger_metadata=None):
+    async def init(self, role="local", primary_logger_metadata=None, process_name=None):
         self.init_called = True
         self.role = role
         self.primary_logger_metadata = primary_logger_metadata or {}
+        self.process_name = process_name
 
-    async def log(self, metrics, step):
-        self.logged_metrics.append((metrics, step))
+    async def log(self, metrics, global_step):
+        self.logged_metrics.append((metrics, global_step))
 
     async def finish(self):
         self.finish_called = True
diff --git a/tests/unit_tests/observability/test_metric_actors.py b/tests/unit_tests/observability/test_metric_actors.py
@@ -0,0 +1,162 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Optimized unit tests for metric actors functionality."""
+
+import pytest
+
+from forge.observability.metric_actors import (
+    get_or_create_metric_logger,
+    GlobalLoggingActor,
+    LocalFetcherActor,
+)
+from monarch.actor import this_host
+
+
+@pytest.fixture
+def global_logger():
+    """Create a GlobalLoggingActor for testing."""
+    p = this_host().spawn_procs(per_host={"cpus": 1})
+    return p.spawn("TestGlobalLogger", GlobalLoggingActor)
+
+
+@pytest.fixture
+def local_fetcher(global_logger):
+    """Create a LocalFetcherActor linked to global logger."""
+    p = this_host().spawn_procs(per_host={"cpus": 1})
+    return p.spawn("TestLocalFetcher", LocalFetcherActor, global_logger)
+
+
+class TestBasicOperations:
+    """Test basic operations for actors."""
+
+    @pytest.mark.asyncio
+    async def test_local_fetcher_flush(self, local_fetcher):
+        """Test LocalFetcherActor flush operations."""
+        result_with_state = await local_fetcher.flush.call_one(
+            global_step=1, return_state=True
+        )
+        assert result_with_state == {}
+
+        result_without_state = await local_fetcher.flush.call_one(
+            global_step=1, return_state=False
+        )
+        assert result_without_state == {}
+
+    @pytest.mark.asyncio
+    async def test_global_logger_basic_ops(self, global_logger):
+        """Test GlobalLoggingActor basic operations."""
+        count = await global_logger.get_fetcher_count.call_one()
+        assert count >= 0
+
+        has_fetcher = await global_logger.has_fetcher.call_one("nonexistent")
+        assert has_fetcher is False
+
+        # Global logger flush (should not raise error)
+        await global_logger.flush.call_one(global_step=1)
+
+    @pytest.mark.asyncio
+    async def test_backend_init(self, local_fetcher):
+        """Test backend initialization and shutdown."""
+        metadata = {"wandb": {"shared_run_id": "test123"}}
+        config = {"console": {"logging_mode": "per_rank_reduce"}}
+
+        await local_fetcher.init_backends.call_one(metadata, config, global_step=5)
+        await local_fetcher.shutdown.call_one()
+
+
+class TestRegistrationLifecycle:
+    """Test registration lifecycle."""
+
+    @pytest.mark.timeout(3)
+    @pytest.mark.asyncio
+    async def test_registration_lifecycle(self, global_logger, local_fetcher):
+        """Test complete registration/deregistration lifecycle."""
+        proc_name = "lifecycle_test_proc"
+
+        # Initial state
+        initial_count = await global_logger.get_fetcher_count.call_one()
+        assert await global_logger.has_fetcher.call_one(proc_name) is False
+
+        # Register
+        await global_logger.register_fetcher.call_one(local_fetcher, proc_name)
+
+        # Verify registered
+        new_count = await global_logger.get_fetcher_count.call_one()
+        assert new_count == initial_count + 1
+        assert await global_logger.has_fetcher.call_one(proc_name) is True
+
+        # Deregister
+        await global_logger.deregister_fetcher.call_one(proc_name)
+
+        # Verify deregistered
+        final_count = await global_logger.get_fetcher_count.call_one()
+        assert final_count == initial_count
+        assert await global_logger.has_fetcher.call_one(proc_name) is False
+
+
+class TestBackendConfiguration:
+    """Test backend configuration validation."""
+
+    @pytest.mark.timeout(3)
+    @pytest.mark.asyncio
+    async def test_valid_backend_configs(self, global_logger):
+        """Test valid backend configurations."""
+        # Empty config
+        await global_logger.init_backends.call_one({})
+
+        # Valid configs for all logging modes
+        for mode in ["per_rank_reduce", "per_rank_no_reduce", "global_reduce"]:
+            config = {"console": {"logging_mode": mode}}
+            await global_logger.init_backends.call_one(config)
+
+    @pytest.mark.timeout(3)
+    @pytest.mark.asyncio
+    async def test_invalid_backend_configs(self, global_logger):
+        """Test that empty or not-yet-validated backend configs initialize without error."""
+        # Empty config should work
+        await global_logger.init_backends.call_one({})
+
+        # Config with only project should work
+        config_with_project = {"console": {"project": "test_project"}}
+        await global_logger.init_backends.call_one(config_with_project)
+
+        # Config with reduce_across_ranks should work (Diff 3 doesn't validate logging_mode yet)
+        config_with_reduce = {"console": {"reduce_across_ranks": True}}
+        await global_logger.init_backends.call_one(config_with_reduce)
+
+
+class TestErrorHandling:
+    """Test error handling scenarios."""
+
+    @pytest.mark.timeout(3)
+    @pytest.mark.asyncio
+    async def test_deregister_nonexistent_fetcher(self, global_logger):
+        """Test deregistering non-existent fetcher doesn't crash."""
+        await global_logger.deregister_fetcher.call_one("nonexistent_proc")
+
+    @pytest.mark.timeout(3)
+    @pytest.mark.asyncio
+    async def test_shutdown(self, global_logger):
+        """Test that shutdown completes without raising."""
+        await global_logger.shutdown.call_one()
+
+
+class TestGetOrCreateMetricLogger:
+    """Test the get_or_create_metric_logger integration function."""
+
+    @pytest.mark.timeout(3)
+    @pytest.mark.asyncio
+    async def test_get_or_create_functionality(self):
+        """Test get_or_create_metric_logger basic functionality."""
+        result = await get_or_create_metric_logger(process_name="TestController")
+
+        # Should return a GlobalLoggingActor mesh
+        assert result is not None
+
+        # Should be able to call basic methods
+        count = await result.get_fetcher_count.call_one()
+        assert count >= 0
diff --git a/tests/unit_tests/observability/test_metrics.py b/tests/unit_tests/observability/test_metrics.py
@@ -80,12 +80,9 @@ def test_new_enums_and_constants(self):
         assert isinstance(BackendRole.LOCAL, BackendRole)
         assert isinstance(BackendRole.GLOBAL, BackendRole)
 
-    @patch("forge.observability.metrics.get_actor_name_with_rank")
     @pytest.mark.asyncio
-    async def test_backend_role_usage(self, mock_actor_name):
+    async def test_backend_role_usage(self):
         """Test that BackendRole constants are actually used instead of string literals."""
-        mock_actor_name.return_value = "TestActor_abcd_r0"
-
         # Test ConsoleBackend
         console_backend = ConsoleBackend({})
         await console_backend.init(role=BackendRole.LOCAL)
@@ -295,10 +292,8 @@ def test_record_metric_enabled_explicit(self, mock_collector_class, mock_rank):
         mock_collector_class.assert_called_once()
         mock_collector.push.assert_called_once()
 
-    @patch("forge.observability.metrics.get_actor_name_with_rank")
-    def test_wandb_backend_creation(self, mock_actor_name):
+    def test_wandb_backend_creation(self):
         """Test WandbBackend creation and basic setup without WandB dependency."""
-        mock_actor_name.return_value = "TestActor_abcd_r0"
 
         config = {
             "project": "test_project",
@@ -316,12 +311,9 @@ def test_wandb_backend_creation(self, mock_actor_name):
         metadata = backend.get_metadata_for_secondary_ranks()
         assert metadata == {}  # Should be empty when no run
 
-    @patch("forge.observability.metrics.get_actor_name_with_rank")
     @pytest.mark.asyncio
-    async def test_console_backend(self, mock_actor_name):
+    async def test_console_backend(self):
         """Test ConsoleBackend basic operations."""
-        mock_actor_name.return_value = "TestActor_abcd_r0"
-
         backend = ConsoleBackend({})
 
         await backend.init(role=BackendRole.LOCAL)
@@ -425,8 +417,10 @@ async def _test_fetcher_registration(self, env_var_value, should_register_fetche
         if hasattr(procs, "_local_fetcher"):
             delattr(procs, "_local_fetcher")
 
-        # Test functionality
-        global_logger = await get_or_create_metric_logger(proc_mesh=procs)
+        # Test functionality - pass explicit process_name since test bypasses provisioner
+        global_logger = await get_or_create_metric_logger(
+            proc_mesh=procs, process_name="TestProcess"
+        )
 
         # Get results to check
         proc_has_fetcher = hasattr(procs, "_local_fetcher")

Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@ async def run(cfg: DictConfig):`
`33`	`33`	`ProvisionerConfig(launcher_config=LauncherConfig(**cfg.provisioner))`
`34`	`34`	`)`
`35`	`35`	`metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}})`
`36`		`- mlogger = await get_or_create_metric_logger()`
	`36`	`+ mlogger = await get_or_create_metric_logger(process_name="Controller")`
`37`	`37`	`await mlogger.init_backends.call_one(metric_logging_cfg)`
`38`	`38`
`39`	`39`	`if (prompt := cfg.get("prompt")) is None:`