
Commit 880d886

make sync model async

1 parent e1a38e7 · commit 880d886

File tree: 6 files changed (+215, -74 lines)

applications/ColossalChat/coati/distributed/comm.py

Lines changed: 62 additions & 3 deletions
@@ -1,5 +1,7 @@
+import copy
 from typing import Any, Dict

+import ray
 import ray.util.collective as cc
 import torch
 import torch.distributed.distributed_c10d as c10d
@@ -32,7 +34,11 @@ def ray_broadcast_object(obj: Any, src: int = 0, device=None, group_name: str =


 def ray_broadcast_tensor_dict(
-    tensor_dict: Dict[str, torch.Tensor], src: int = 0, device=None, group_name: str = "default"
+    tensor_dict: Dict[str, torch.Tensor],
+    src: int = 0,
+    device=None,
+    group_name: str = "default",
+    offload_to_cpu: bool = False,
 ) -> Dict[str, torch.Tensor]:
     rank = cc.get_rank(group_name)
     if rank == src:
@@ -46,12 +52,65 @@ def ray_broadcast_tensor_dict(
     out_dict = {}
     for k, shape, dtype in metadata:
         if rank == src:
-            tensor = tensor_dict[k]
+            if offload_to_cpu:
+                tensor = tensor_dict[k].to(device)
+            else:
+                tensor = tensor_dict[k]
         else:
             tensor = torch.empty(shape, dtype=dtype, device=device)
         cc.broadcast(tensor, src, group_name)
         if rank != src:
-            out_dict[k] = tensor
+            if offload_to_cpu:
+                out_dict[k] = tensor.cpu()
+            else:
+                out_dict[k] = tensor
     if rank == src:
         out_dict = tensor_dict
     return out_dict
+
+
+@ray.remote
+class SharedVariableActor:
+    def __init__(self):
+        # double queues
+        self.data_queue = None
+        self.data_queue_buffered = None
+        self.model_weights = None
+        self.data_access_count = 0
+        self.ready_process_count = {}
+
+    def increase_ready_process_count(self, name):
+        self.ready_process_count = {k: v for k, v in self.ready_process_count.items() if k > name - 5}
+        if name not in self.ready_process_count:
+            self.ready_process_count[name] = 0
+        self.ready_process_count[name] += 1
+
+    def get_ready_process_count(self, name):
+        return self.ready_process_count[name]
+
+    def extend_data(self, data):
+        if self.data_access_count > 0:
+            # update the buffered data if data is not being accessed by all consumers
+            # if producer are too fast, will not overwrite the data but extend the data
+            if self.data_queue_buffered is None:
+                self.data_queue_buffered = []
+            self.data_queue_buffered.extend(data)
+            return True
+        if self.data_queue is None:
+            self.data_queue = []
+        self.data_queue.extend(data)
+        self.data_access_count = 0
+        return True
+
+    def get_data(self):
+        if self.data_queue is None:
+            return None
+        data = copy.deepcopy(self.data_queue)
+        self.data_access_count += 1
+        if self.data_access_count == 4:
+            # data in data_queue has been accessed by all consumers
+            # swap the data queue with the buffered data, erase the old data
+            if self.data_queue_buffered is not None:
+                self.data_queue = self.data_queue_buffered
+                self.data_queue_buffered = None
+        return data
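
The SharedVariableActor added above is a small coordination actor: producers append rollout data with extend_data, consumers pull copies with get_data, and the double queue keeps a fresh batch from overwriting one that is still being read. Below is a minimal usage sketch, not taken from this commit: it assumes a local Ray runtime, toy dictionary payloads, and four readers to match the hard-coded `data_access_count == 4` check in get_data.

    import ray

    from coati.distributed.comm import SharedVariableActor

    ray.init()
    actor = SharedVariableActor.remote()

    # A producer pushes a list of samples. If the current queue is still being
    # read (data_access_count > 0), extend_data parks new samples in the
    # buffered queue instead of touching the one consumers are reading.
    ray.get(actor.extend_data.remote([{"prompt_id": 0}, {"prompt_id": 1}]))

    # Each consumer takes a deep copy of the whole queue; after the fourth read
    # get_data swaps in the buffered queue, if one has accumulated.
    for consumer_rank in range(4):
        batch = ray.get(actor.get_data.remote())
        print(f"consumer {consumer_rank} got {len(batch)} samples")

    # Ready-count handshake used before a model broadcast: every participant
    # bumps the counter for the current sync step, and the broadcaster waits
    # until it reaches num_producers + 1.
    ray.get(actor.increase_ready_process_count.remote(name=0))
    print(ray.get(actor.get_ready_process_count.remote(name=0)))  # 1

    ray.shutdown()

The offload_to_cpu flag on ray_broadcast_tensor_dict is independent of the actor: on the sender it moves each (CPU-resident) tensor onto `device` before cc.broadcast, and on receivers it moves the result back with .cpu(), so large state dicts do not have to stay resident on the GPU between syncs.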

applications/ColossalChat/coati/distributed/consumer.py

Lines changed: 64 additions & 50 deletions
@@ -1,4 +1,6 @@
 import os
+import threading
+import time
 from contextlib import nullcontext
 from typing import Any, Dict, Optional

@@ -16,13 +18,15 @@
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.utils import get_current_device

-from .comm import ray_broadcast_tensor_dict
+from .comm import SharedVariableActor, ray_broadcast_tensor_dict
 from .utils import bind_batch, post_recv, unbind_batch


 class BaseConsumer:
     def __init__(
         self,
+        shared_sync_data_actor: SharedVariableActor,
+        shared_sync_model_actor: SharedVariableActor,
         num_producers: int,
         num_episodes: int,
         rank: int,
@@ -63,6 +67,13 @@ def __init__(
         self.lr_scheduler = None
         self.n_behind = n_behind

+        # for running sync data and model in separate actors/threads
+        self.shared_sync_data_actor = shared_sync_data_actor
+        self.shared_sync_model_actor = shared_sync_model_actor
+        self.thread_started = False
+        self.model_sync_step = 0
+        self.state_dict_cpu = {}
+
     def setup(self) -> None:
         launch(self.rank, self.world_size, self.master_addr, self.master_port, local_rank=0)

@@ -85,6 +96,7 @@ def setup(self) -> None:
         self.pp_size = dist.get_world_size(self.plugin.pp_group)

         # Init Hybrid ray process group
+        cc.init_collective_group(self.world_size, self.rank, group_name="consumer_pg")
         for i in range(self.num_producers):
             cc.init_collective_group(self.world_size + 1, self.rank + 1, group_name=f"sync_data_{i}")
         if self.pp_size > 1:
@@ -152,44 +164,12 @@ def loop(self) -> None:
                    torch.cuda.reset_peak_memory_stats()
                    i = 0
                    for _ in range(self.num_recv_per_update):
-                        # after sync model, do not wait for more data to arrive as rollout takes time, use buffered data
-                        effective_group_to_raw_group_mapping = self.calculate_effective_group_to_raw_group_mapping()
-                        while len(effective_group_to_raw_group_mapping) > max(
-                            self.dp_size * self.batch_size
-                            - self.dp_size
-                            * self.minibatch_size
-                            * self.grpo_config.get("num_minibatch_during_rollout", 1),
-                            self.dp_size * self.minibatch_size,
-                        ):
-                            self.profiler.log(
-                                f"Still have {len(effective_group_to_raw_group_mapping)} effective groups, greater than {self.dp_size * self.minibatch_size}, start training"
-                            )
-                            batch, raw_mini_batches_metric_dict = self.prepare_mini_batch(
-                                effective_group_to_raw_group_mapping
-                            )
-                            self.profiler.enter("step")
-                            loss = self.step(i, pbar, **batch, **raw_mini_batches_metric_dict)
-                            self.profiler.exit("step")
-                            self.buffer = self.buffer[
-                                effective_group_to_raw_group_mapping[self.dp_size * self.minibatch_size - 1] + 1 :
-                            ]
-                            # recalculate the effective group to raw group mapping
-                            effective_group_to_raw_group_mapping_size_before = len(effective_group_to_raw_group_mapping)
-                            effective_group_to_raw_group_mapping = self.calculate_effective_group_to_raw_group_mapping()
-                            assert (
-                                len(effective_group_to_raw_group_mapping)
-                                == effective_group_to_raw_group_mapping_size_before - self.dp_size * self.minibatch_size
-                            )
-                            if loss is not None:
-                                pbar.set_postfix({"loss": loss})
-                            i += 1
-
                        # receive data from producers
                        for r in range(self.num_producers):
                            print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}")
                            self.profiler.enter(f"recv_broadcast_data_P{r}")
                            raw_batch = ray_broadcast_tensor_dict(
-                                None, src=0, device=self.device, group_name=f"sync_data_{r}"
+                                None, src=0, device=self.device, group_name=f"sync_data_{r}", offload_to_cpu=False
                            )
                            self.profiler.exit(f"recv_broadcast_data_P{r}")
                            # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete),
@@ -238,10 +218,7 @@ def loop(self) -> None:
                            f"[T{dist.get_rank()}] Collect Effective Prompt: {len(effective_group_to_raw_group_mapping)}/{self.dp_size * self.minibatch_size}"
                        )

-                        while len(effective_group_to_raw_group_mapping) > self.dp_size * self.batch_size:
-                            self.profiler.log(
-                                f"Received {len(effective_group_to_raw_group_mapping)} effective groups, greater than {self.dp_size * self.batch_size}, start training after recv"
-                            )
+                        while len(effective_group_to_raw_group_mapping) >= self.dp_size * self.minibatch_size:
                            # always keep at least dp_size * batch_size effective samples in the buffer for training during the rollout times after each sync model
                            # on each dp_rank, we use minibatch_size effective samples to form a batch
                            batch, raw_mini_batches_metric_dict = self.prepare_mini_batch(
@@ -273,34 +250,67 @@ def loop(self) -> None:
                        if self.rank == 0:
                            print(f"Saved model checkpoint at step {step + 1} in folder {save_path}")

-                    if (episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1) and (
-                        episode != 0 or step >= self.n_behind
-                    ):
+                    if episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1:
                        if self.pp_size > 1:
                            print(
                                f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} step {step}"
                            )
                        else:
                            print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}")
-                        self.profiler.enter("sync_model")
                        torch.cuda.empty_cache()
-                        state_dict = self.state_dict()
+                        self.state_dict_cpu = {k: v.cpu() for k, v in self.state_dict().items()}
+                        cc.barrier(group_name="consumer_pg")
                        if self.pp_size > 1:
                            if self.tp_rank == 0 and self.dp_rank == 0:
+                                self.profiler.enter("sync_model")
                                ray_broadcast_tensor_dict(
-                                    state_dict,
+                                    self.state_dict_cpu,
                                    src=self.num_producers,
                                    device=self.device,
                                    group_name=f"sync_model_{self.pp_rank}",
+                                    offload_to_cpu=True,
                                )
+                                self.profiler.exit("sync_model")
                        else:
                            if self.rank == 0:
-                                ray_broadcast_tensor_dict(
-                                    state_dict, src=self.num_producers, device=self.device, group_name="sync_model"
-                                )
-                        del state_dict
-                        torch.cuda.empty_cache()
-                        self.profiler.exit("sync_model")
+                                # ray_broadcast_tensor_dict(
+                                #     self.state_dict_cpu, src=self.num_producers, device=self.device, group_name="sync_model", offload_to_cpu=True
+                                # )
+                                if not self.thread_started:
+
+                                    def broadcast_state_dict():
+                                        self.thread_started = True
+                                        self.profiler.enter("sync_model")
+                                        # lazy broadcast state_dict if and only if both consumer and all producers are idle (not broadcasting the last state_dict)
+                                        ray.get(
+                                            self.shared_sync_model_actor.increase_ready_process_count.remote(
+                                                name=self.model_sync_step
+                                            )
+                                        )
+                                        ready_process_count = ray.get(
+                                            self.shared_sync_model_actor.get_ready_process_count.remote(
+                                                name=self.model_sync_step
+                                            )
+                                        )
+                                        while ready_process_count != self.num_producers + 1:
+                                            time.sleep(0.5)
+                                            ready_process_count = ray.get(
+                                                self.shared_sync_model_actor.get_ready_process_count.remote(
+                                                    name=self.model_sync_step
+                                                )
+                                            )
+                                        ray_broadcast_tensor_dict(
+                                            self.state_dict_cpu,
+                                            src=self.num_producers,
+                                            device=self.device,
+                                            group_name="sync_model",
+                                            offload_to_cpu=True,
+                                        )
+                                        self.model_sync_step += 1
+                                        self.thread_started = False
+                                        self.profiler.exit("sync_model")
+
+                                    threading.Thread(target=broadcast_state_dict, daemon=True).start()
                    self.profiler.log(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")

     def __del__(self):
@@ -312,6 +322,8 @@ def __del__(self):
 class SimpleConsumer(BaseConsumer):
     def __init__(
         self,
+        shared_sync_data_actor: SharedVariableActor,
+        shared_sync_model_actor: SharedVariableActor,
         num_producers,
         num_episodes,
         rank,
@@ -328,6 +340,8 @@ def __init__(
         save_dir="./model",
     ):
         super().__init__(
+            shared_sync_data_actor,
+            shared_sync_model_actor,
             num_producers,
             num_episodes,
             rank,
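
The heart of this change is that the consumer no longer blocks its training loop on the weight broadcast: it snapshots the state dict to CPU, synchronizes on the new consumer_pg barrier, and then a daemon thread performs the ready-count handshake and the actual broadcast. The standalone sketch below illustrates the pattern only; threading.Event and send_fn are stand-ins for the shared-actor ready count and for ray_broadcast_tensor_dict(..., offload_to_cpu=True), and none of these names come from the consumer itself.

    import threading
    import time

    import torch


    def async_model_sync(state_dict, ready_event, send_fn):
        # 1) snapshot the weights to CPU on the main thread, as loop() now does,
        #    so the next training step can keep mutating the GPU copy
        state_dict_cpu = {k: v.detach().cpu() for k, v in state_dict.items()}

        def _broadcast():
            # 2) wait until every participant has reported ready for this sync step
            ready_event.wait()
            send_fn(state_dict_cpu)

        # 3) ship the weights from a daemon thread while training continues
        threading.Thread(target=_broadcast, daemon=True).start()


    if __name__ == "__main__":
        model = torch.nn.Linear(4, 4)
        ready = threading.Event()
        async_model_sync(model.state_dict(), ready, lambda sd: print("sent", sorted(sd)))
        time.sleep(0.1)  # a training step would run here
        ready.set()      # all producers plus the consumer are ready
        time.sleep(0.1)  # give the daemon thread time to "send"

In the committed code the thread_started flag serves a similar purpose to the single Event here: a new broadcast thread is only launched once the previous one has finished.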

applications/ColossalChat/coati/distributed/grpo_consumer.py

Lines changed: 5 additions & 0 deletions
@@ -4,6 +4,7 @@
 import ray
 import torch
 import wandb
+from coati.distributed.comm import SharedVariableActor
 from coati.distributed.consumer import BaseConsumer
 from coati.distributed.loss import PolicyLoss
 from coati.distributed.utils import memory_efficient_logprob
@@ -18,6 +19,8 @@
 class GRPOConsumer(BaseConsumer):
     def __init__(
         self,
+        shared_sync_data_actor: SharedVariableActor,
+        shared_sync_model_actor: SharedVariableActor,
         num_producers,
         num_episodes,
         rank,
@@ -51,6 +54,8 @@ def __init__(
             1, grpo_config.get("train_microbatch_size") // plugin_config.get("pp_size", 1)
         )
         super().__init__(
+            shared_sync_data_actor,
+            shared_sync_model_actor,
             num_producers,
             num_episodes,
             rank,

applications/ColossalChat/coati/distributed/launch.py

Lines changed: 9 additions & 0 deletions
@@ -5,6 +5,7 @@

 import ray

+from .comm import SharedVariableActor
 from .consumer import SimpleConsumer
 from .grpo_consumer import GRPOConsumer
 from .producer import SimpleProducer
@@ -87,6 +88,10 @@ def launch_distributed(
     # allocating the producer to nodes with lower node id and the consumer to the resouces from nodes with higher
     # node id. See the reference here: https://docs.ray.io/en/latest/ray-core/scheduling/index.html#nodeaffinityschedulingstrategy
     nodes = ray.nodes()
+
+    shared_sync_data_actor = SharedVariableActor.remote()
+    shared_sync_model_actor = SharedVariableActor.remote()
+
     node_info = {
         node["NodeID"]: {
             "num_gpus": node["Resources"].get("GPU", 0),
@@ -111,6 +116,8 @@ def launch_distributed(
         gpu_to_ip_address.pop(0)
         print(f"Schedual Producer P[{i}] which requires {num_proc_per_producer} GPUs on node {producer_ip_address}")
         producer = SimpleProducer.options(num_gpus=num_proc_per_producer).remote(
+            shared_sync_data_actor=shared_sync_data_actor,
+            shared_sync_model_actor=shared_sync_model_actor,
             producer_idx=i,
             num_producers=num_producers,
             num_consumer_procs=num_consumer_procs,
@@ -155,6 +162,8 @@ def launch_distributed(
         gpu_to_ip_address.pop(0)
         print(f"Schedual Consumer T[{i}] which requires 1 GPUs on node {consumer_ip_address}")
         consumer = core_consumer.options(num_gpus=1).remote(
+            shared_sync_data_actor=shared_sync_data_actor,
+            shared_sync_model_actor=shared_sync_model_actor,
             num_producers=num_producers,
             num_episodes=num_episodes,
             rank=i,
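
launch_distributed now creates the two SharedVariableActor handles once on the driver and passes the same handles into every producer and consumer, so all processes coordinate through a single pair of actors. A minimal sketch of that wiring, with a placeholder Worker actor standing in for SimpleProducer/GRPOConsumer (assumes Ray and an importable coati package; not code from this commit):

    import ray

    from coati.distributed.comm import SharedVariableActor


    @ray.remote
    class Worker:
        def __init__(self, shared_sync_data_actor, shared_sync_model_actor, rank):
            # every worker holds handles to the same two coordination actors
            self.data_actor = shared_sync_data_actor
            self.model_actor = shared_sync_model_actor
            self.rank = rank

        def report_ready(self, sync_step):
            ray.get(self.model_actor.increase_ready_process_count.remote(name=sync_step))


    if __name__ == "__main__":
        ray.init()
        # created once on the driver, then passed to every remote actor
        shared_sync_data_actor = SharedVariableActor.remote()
        shared_sync_model_actor = SharedVariableActor.remote()

        workers = [Worker.remote(shared_sync_data_actor, shared_sync_model_actor, r) for r in range(3)]
        ray.get([w.report_ready.remote(0) for w in workers])
        # all three reports landed on the same coordination actor
        print(ray.get(shared_sync_model_actor.get_ready_process_count.remote(name=0)))  # 3
        ray.shutdown()

Because Ray actor handles are serializable, passing them as constructor arguments is enough: every worker's .remote() calls are routed to the same actor process.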
