Commit 2b7e4c3

kevinmtang authored and pytorchmergebot committed
[DCP] Add option to use PrefixStore to create checkpoint background process (pytorch#166560)
Summary: The DCP checkpoint background process currently determines the port used for its process group via get_free_port(). During background process initialization, gloo pg init occasionally times out on the first call but succeeds on a subsequent call. We hypothesized that the timeouts are related to the port being used, and that the fix is to create the pg with a PrefixStore that reuses the master port.

This diff adds the option for the checkpoint background process to use a PrefixStore built on MASTER_ADDR + MASTER_PORT. The default behavior is unchanged; enabling the new PrefixStore behavior requires setting the "DCP_USE_PREFIX_STORE" env var to "1".

context: https://fb.workplace.com/groups/319878845696681/permalink/1516883985996155/

Differential Revision: D84928180

Pull Request resolved: pytorch#166560
Approved by: https://github.com/meetv18
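For reference, a minimal sketch of opting in from a training script, assuming a single-rank gloo setup. The env var name and its "1"-only parsing come from this PR; everything else is standard torch.distributed usage:

import os

import torch.distributed as dist

# Opt in before the first async save; leaving the var unset (or set to
# anything other than "1") keeps the old get_free_port() behavior.
os.environ["DCP_USE_PREFIX_STORE"] = "1"

# With the flag on, both MASTER_ADDR and MASTER_PORT must be set; otherwise
# _ProcessGroupInitInfo fails (surfaced as a CheckpointException in the tests).
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")

dist.init_process_group(backend="gloo", rank=0, world_size=1)
# Subsequent async DCP saves spawn the checkpoint background process, which
# now builds its gloo pg from a PrefixStore over MASTER_ADDR:MASTER_PORT
# instead of a freshly allocated free port.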
1 parent 6c98657 commit 2b7e4c3

File tree

2 files changed: +229 -47 lines changed

test/distributed/checkpoint/test_async_process_executor.py

Lines changed: 184 additions & 37 deletions
@@ -1,16 +1,26 @@
 # Owner(s): ["oncall: distributed checkpointing"]
 
+import os
 import sys
 from unittest.mock import patch
 
 import torch
+import torch.testing._internal.common_utils as common
 from torch import distributed as dist
 from torch.distributed.checkpoint._async_process_executor import (
     _ProcessBasedAsyncCheckpointExecutor,
+    _ProcessGroupInitInfo,
 )
+from torch.distributed.checkpoint.api import CheckpointException
 from torch.distributed.checkpoint.storage import StorageWriter
 from torch.distributed.elastic.utils.distributed import get_free_port
-from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
+from torch.testing._internal.common_distributed import skip_if_win32
+from torch.testing._internal.common_utils import (
+    retry_on_connect_failures,
+    run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
+    TestCase,
+)
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
     with_comms,
@@ -110,47 +120,184 @@ def test_checkpoint_save_failure_continues_serving(self) -> None:
             "epoch": 5,
         }
 
-        # 1. Simulate a failure in creating PG in background process.
-        with patch(
-            "torch.distributed.checkpoint._async_process_executor.get_free_port",
-            return_value=-1,
+        with patch.dict(os.environ, {}, clear=False):
+            os.environ.pop("DCP_USE_PREFIX_STORE", None)
+
+            # 1. Simulate a failure in creating PG in background process.
+            with patch(
+                "torch.distributed.checkpoint._async_process_executor.get_free_port",
+                return_value=-1,
+            ):
+                with self.assertRaises(ValueError) as _:
+                    proc_executor = _ProcessBasedAsyncCheckpointExecutor()
+                    fut = proc_executor.execute_save(
+                        staging_future_or_state_dict=test_state_dict,
+                    )
+                    fut.result()
+
+            # 2. Attempt save with failing storage writer
+            with patch(
+                "torch.distributed.checkpoint._async_process_executor.get_free_port",
+                return_value=get_free_port(),
+            ) as mock_get_free_port:
+                proc_executor = _ProcessBasedAsyncCheckpointExecutor()
+                fut = proc_executor.execute_save(
+                    staging_future_or_state_dict=test_state_dict,
+                    storage_writer=TestStorageWriter(behavior="fail_once"),
+                )
+                self.assertIn(
+                    "fail_once policy triggered failure", str(fut.exception())
+                )
+                # Verify new process was created for this attempt
+                if dist.get_rank() == 0:
+                    mock_get_free_port.assert_called_once()
+
+            # 3. Second save attempt with successful storage writer - process should still be alive
+            with patch(
+                "torch.distributed.checkpoint._async_process_executor.get_free_port",
+            ) as mock_get_free_port:
+                proc_executor = _ProcessBasedAsyncCheckpointExecutor()
+                fut = proc_executor.execute_save(
+                    staging_future_or_state_dict=test_state_dict,
+                    storage_writer=TestStorageWriter(behavior="success"),
+                )
+                result = fut.result()
+                # Verify process is still alive
+                mock_get_free_port.assert_not_called()
+                # Verify successful save
+                self.assertIsNotNone(result)
+
+
+class TestAsyncProcessExecutorPrefixStore(TestCase):
+    @skip_if_win32()
+    @retry_on_connect_failures
+    def test_checkpoint_save_with_prefix_store_enabled(self) -> None:
+        """Test that checkpoint save works when DCP_USE_PREFIX_STORE is enabled."""
+
+        test_state_dict = {
+            "model": {"weight": torch.randn(4, 4), "bias": torch.randn(4)},
+            "optimizer": {"param_groups": [{"lr": 0.01}]},
+            "epoch": 5,
+        }
+
+        master_addr = "localhost"
+        master_port = str(common.find_free_port())
+
+        with patch.dict(
+            os.environ,
+            {
+                "DCP_USE_PREFIX_STORE": "1",
+                "MASTER_ADDR": master_addr,
+                "MASTER_PORT": master_port,
+            },
         ):
-            with self.assertRaises(ValueError) as _:
+            with patch(
+                "torch.distributed.checkpoint._async_process_executor.get_free_port"
+            ) as mock_get_free_port:
+                dist.init_process_group(
+                    backend=dist.Backend.GLOO,
+                    rank=0,
+                    world_size=1,
+                )
+
                 proc_executor = _ProcessBasedAsyncCheckpointExecutor()
                 fut = proc_executor.execute_save(
                     staging_future_or_state_dict=test_state_dict,
+                    storage_writer=TestStorageWriter(behavior="success"),
                 )
-                fut.result()
-
-        # 2. Attempt save with failing storage writer
-        with patch(
-            "torch.distributed.checkpoint._async_process_executor.get_free_port",
-            return_value=get_free_port(),
-        ) as mock_get_free_port:
-            proc_executor = _ProcessBasedAsyncCheckpointExecutor()
-            fut = proc_executor.execute_save(
-                staging_future_or_state_dict=test_state_dict,
-                storage_writer=TestStorageWriter(behavior="fail_once"),
-            )
-            self.assertIn("fail_once policy triggered failure", str(fut.exception()))
-            # Verify new process was created for this attempt
-            if dist.get_rank() == 0:
-                mock_get_free_port.assert_called_once()
-
-        # 3. Second save attempt with successful storage writer - process should still be alive
-        with patch(
-            "torch.distributed.checkpoint._async_process_executor.get_free_port",
-        ) as mock_get_free_port:
-            proc_executor = _ProcessBasedAsyncCheckpointExecutor()
-            fut = proc_executor.execute_save(
-                staging_future_or_state_dict=test_state_dict,
-                storage_writer=TestStorageWriter(behavior="success"),
-            )
-            result = fut.result()
-            # Verify process is still alive
-            mock_get_free_port.assert_not_called()
-            # Verify successful save
-            self.assertIsNotNone(result)
+                result = fut.result()
+                self.assertIsNotNone(result)
+                mock_get_free_port.assert_not_called()
+
+
+class TestProcessGroupInitInfo(DTensorTestBase):
+    """Test suite for _ProcessGroupInitInfo."""
+
+    @with_comms
+    def test_process_group_init_info_with_default_pg(self) -> None:
+        """Test that ProcessGroupInitInfo correctly initializes."""
+        with patch.dict(os.environ, {}, clear=False):
+            os.environ.pop("DCP_USE_PREFIX_STORE", None)
+
+            pg_init_info = _ProcessGroupInitInfo()
+
+            self.assertEqual(pg_init_info.global_rank, dist.get_rank())
+            self.assertEqual(pg_init_info.world_size, dist.get_world_size())
+            self.assertIsNotNone(pg_init_info.tcp_store_master_addr)
+            self.assertGreater(pg_init_info.tcp_store_master_port, 0)
+            self.assertEqual(pg_init_info.use_prefix_store, False)
+
+    @with_comms
+    def test_process_group_init_info_with_prefix_store_env_var(self) -> None:
+        """Test that ProcessGroupInitInfo handles DCP_USE_PREFIX_STORE environment variable."""
+
+        # Flag enabled, addr/port correctly defined
+        with patch.dict(
+            os.environ,
+            {
+                "DCP_USE_PREFIX_STORE": "1",
+                "MASTER_ADDR": "localhost",
+                "MASTER_PORT": "12345",
+            },
+        ):
+            pg_init_info = _ProcessGroupInitInfo()
+            self.assertTrue(pg_init_info.use_prefix_store)
+
+        # Missing port
+        with patch.dict(
+            os.environ, {"DCP_USE_PREFIX_STORE": "1", "MASTER_ADDR": "localhost"}
+        ):
+            with self.assertRaises(CheckpointException):
+                pg_init_info = _ProcessGroupInitInfo()
+        # Missing addr
+        with patch.dict(
+            os.environ, {"DCP_USE_PREFIX_STORE": "1", "MASTER_PORT": "12345"}
+        ):
+            with self.assertRaises(CheckpointException):
+                pg_init_info = _ProcessGroupInitInfo()
+        # Invalid port
+        with patch.dict(
+            os.environ,
+            {
+                "DCP_USE_PREFIX_STORE": "1",
+                "MASTER_ADDR": "localhost",
+                "MASTER_PORT": "a",
+            },
+        ):
+            with self.assertRaises(CheckpointException):
+                pg_init_info = _ProcessGroupInitInfo()
+
+    @with_comms
+    def test_process_group_init_info_without_prefix_store_env_var(self) -> None:
+        """Test that ProcessGroupInitInfo defaults to not using prefix store."""
+
+        # Env var set to 0
+        with patch.dict(os.environ, {"DCP_USE_PREFIX_STORE": "0"}):
+            pg_init_info = _ProcessGroupInitInfo()
+            self.assertFalse(pg_init_info.use_prefix_store)
+
+        # Missing env var
+        with patch.dict(os.environ, {}, clear=False):
+            os.environ.pop("DCP_USE_PREFIX_STORE", None)
+            pg_init_info = _ProcessGroupInitInfo()
+            self.assertFalse(pg_init_info.use_prefix_store)
+
+        # Invalid env var
+        with patch.dict(os.environ, {"DCP_USE_PREFIX_STORE": "2"}):
+            pg_init_info = _ProcessGroupInitInfo()
+            self.assertFalse(pg_init_info.use_prefix_store)
+
+        with patch.dict(os.environ, {"DCP_USE_PREFIX_STORE": "true"}):
+            pg_init_info = _ProcessGroupInitInfo()
+            self.assertFalse(pg_init_info.use_prefix_store)
+
+        with patch.dict(os.environ, {"DCP_USE_PREFIX_STORE": "false"}):
+            pg_init_info = _ProcessGroupInitInfo()
+            self.assertFalse(pg_init_info.use_prefix_store)
+
+        with patch.dict(os.environ, {"DCP_USE_PREFIX_STORE": ""}):
+            pg_init_info = _ProcessGroupInitInfo()
+            self.assertFalse(pg_init_info.use_prefix_store)
 
 
 if __name__ == "__main__":
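Note the strictness the last test pins down: per the executor change below, only the literal string "1" enables the feature. A tiny illustrative check (the helper name here is hypothetical):

import os

def _use_prefix_store() -> bool:
    # Mirrors the parsing in _ProcessGroupInitInfo: "true", "false", "2",
    # "" and unset all fall through to the default get_free_port() path.
    return os.environ.get("DCP_USE_PREFIX_STORE", "0") == "1"

os.environ["DCP_USE_PREFIX_STORE"] = "true"
assert not _use_prefix_store()  # not the literal "1"
os.environ["DCP_USE_PREFIX_STORE"] = "1"
assert _use_prefix_store()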

torch/distributed/checkpoint/_async_process_executor.py

Lines changed: 45 additions & 10 deletions
@@ -10,6 +10,7 @@
 
 import torch.distributed as dist
 import torch.multiprocessing as mp
+from torch.distributed import PrefixStore, TCPStore
 from torch.distributed.checkpoint._async_executor import _AsyncCheckpointExecutor
 from torch.distributed.checkpoint.logger import _dcp_method_logger, _init_logger
 from torch.distributed.checkpoint.metadata import Metadata, STATE_DICT_TYPE
@@ -55,15 +56,17 @@ class _ProcessGroupInitInfo:
     world_size: int
     tcp_store_master_addr: str
     tcp_store_master_port: int
+    use_prefix_store: bool
 
     def __init__(self, process_group: Optional[dist.ProcessGroup] = None):
         self.local_rank = dist.get_node_local_rank(fallback_rank=0)
         self.global_rank = dist.get_rank(process_group)
         self.world_size = dist.get_world_size(process_group)
+        self.use_prefix_store = os.environ.get("DCP_USE_PREFIX_STORE", "0") == "1"
 
-        # Let coordinator rank find a free port on the localhost.
-        # Broadcast the (master_addr, free_port) to all ranks; each rank in the
-        # checkpoint daemon process will use TCPStore (master_addr, master_port)
+        # Let coordinator rank find a port on the localhost.
+        # Broadcast the (master_addr, port) to all ranks; each rank in the
+        # checkpoint daemon process will use TCPStore (master_addr, port)
         # for collective communication.
         dist_wrapper: _DistWrapper = _DistWrapper(
             group=process_group,
@@ -72,10 +75,23 @@ def __init__(self, process_group: Optional[dist.ProcessGroup] = None):
         )
 
         def get_master_addr_and_port() -> tuple[str, int]:
-            master_addr = os.environ.get("MASTER_ADDR")
-            if master_addr is None:
-                master_addr = _get_fq_hostname()
-            return master_addr, get_free_port()
+            if self.use_prefix_store:
+                master_addr = os.environ.get("MASTER_ADDR")
+                master_port = os.environ.get("MASTER_PORT")
+                assert master_addr is not None, (
+                    "DCP needs MASTER_ADDR to use prefix store"
+                )
+                assert master_port is not None, (
+                    "DCP needs MASTER_PORT to use prefix store"
+                )
+                master_port = int(master_port)
+            else:
+                master_addr = os.environ.get("MASTER_ADDR")
+                if master_addr is None:
+                    master_addr = _get_fq_hostname()
+                master_port = get_free_port()
+
+            return master_addr, master_port
 
         self.tcp_store_master_addr, self.tcp_store_master_port = dist_wrapper.broadcast(
             step="get_master_addr_and_port",
@@ -221,10 +237,29 @@ def _checkpointing_subprocess(
         os.environ["WORLD_SIZE"] = str(pg_init_info.world_size)
 
         logger.info(
-            "Initializing dist.ProcessGroup in checkpoint background process"
+            "Initializing dist.ProcessGroup in checkpoint background process on port %s",
+            pg_init_info.tcp_store_master_port,
         )
         # NOTE: GLOO backend is enforced here.
-        dist.init_process_group(backend=dist.Backend.GLOO)
+        if pg_init_info.use_prefix_store:
+            logger.info(
+                "Initializing dist.ProcessGroup in checkpoint background process with prefix store"
+            )
+            store = PrefixStore(
+                "AsyncCheckpointProcess/",
+                TCPStore(
+                    pg_init_info.tcp_store_master_addr,
+                    pg_init_info.tcp_store_master_port,
+                ),
+            )
+            dist.init_process_group(
+                backend=dist.Backend.GLOO,
+                store=store,
+                world_size=pg_init_info.world_size,
+                rank=pg_init_info.global_rank,
+            )
+        else:
+            dist.init_process_group(backend=dist.Backend.GLOO)
         dist.barrier()
 
         logger.info("Checkpoint background process is running...")
@@ -365,7 +400,7 @@ def execute_save(
         global _CHECKPOINT_PROCESS
         pg_init_info: Optional[_ProcessGroupInitInfo] = None
         if _CHECKPOINT_PROCESS is None:
-            # Find a free port on coordinator rank and broadcast
+            # Find a port on coordinator rank and broadcast
             # to all ranks.
             pg_init_info = _ProcessGroupInitInfo(process_group)
 