
Commit 150ca20

offload sync model to threads
1 parent 522f664 commit 150ca20

8 files changed, +250 -179 lines changed

applications/ColossalChat/coati/distributed/comm.py

Lines changed: 45 additions & 12 deletions
@@ -40,25 +40,26 @@ def ray_broadcast_tensor_dict(
     group_name: str = "default",
     backend: str = "nccl",
     offload_to_cpu: bool = False,
+    pin_memory: bool = False,
 ) -> Dict[str, torch.Tensor]:
     rank = cc.get_rank(group_name)
+    if tensor_dict is None:
+        tensor_dict = {}
     if rank == src:
         metadata = []
         for k, v in tensor_dict.items():
             metadata.append((k, v.shape, v.dtype))
     else:
         metadata = None
     metadata = ray_broadcast_object(metadata, src, device, group_name)
-    if rank != src:
-        out_dict = {}
     for k, shape, dtype in metadata:
         if rank == src:
             if offload_to_cpu:
                 tensor = tensor_dict[k].to(device)
             else:
                 tensor = tensor_dict[k]
         else:
-            tensor = torch.empty(shape, dtype=dtype, device=device)
+            tensor = tensor_dict.get(k, torch.zeros(shape, dtype=dtype, device=device, pin_memory=pin_memory))
         if backend == "gloo" and dtype == torch.bfloat16:
             # Gloo does not support bfloat16, convert to float16
             tensor = tensor.view(torch.float16)
@@ -68,26 +69,41 @@ def ray_broadcast_tensor_dict(
             tensor = tensor.view(torch.bfloat16)
         if rank != src:
             if offload_to_cpu:
-                out_dict[k] = tensor.cpu()
+                tensor_dict[k] = tensor.cpu()
             else:
-                out_dict[k] = tensor
-    if rank == src:
-        out_dict = tensor_dict
-    return out_dict
+                tensor_dict[k] = tensor
+    return tensor_dict


 @ray.remote
 class SharedVariableActor:
-    def __init__(self, number_of_readers: int = 1):
+    def __init__(self, number_of_readers: int = 0, buffer_size_limit: int = 1000):
         self.data_queue = []
         self.data_uid = 0
         self.number_of_readers = number_of_readers
+        self.queue_size = 0
         self.signals = {}
+        self.process_locks = {}
         self.signal_procs_meet_count = {}
+        self.buffer_size_limit = buffer_size_limit

-    def get_queued_data_size(self):
-        queued_data_size = sum([data[1]["input_ids"].size(0) for data in self.data_queue])
-        return queued_data_size
+    def pickup_rollout_task(self, num_tasks: int):
+        """
+        Use the queue size to decide whether producers should generate new rollouts or wait
+        for the consumer to catch up. If the queue size is below the threshold, the consumer
+        is consuming data fast enough, so producers can generate new rollouts; if it is above
+        the threshold, the consumer is falling behind, so producers should wait.
+
+        Any free producer can pick up the task to generate a rollout and then increase the
+        reserved queue size, preventing other producers from picking up the same task
+        redundantly. Note that this is not the real queue length, as data may still be
+        being generated.
+        """
+        ret = False
+        if self.queue_size < self.buffer_size_limit:
+            ret = True
+            self.queue_size += num_tasks
+        return ret

     def append_data(self, data):
         self.data_queue.append([self.data_uid, data, 0])  # [data_uid, data, access_count]
@@ -112,8 +128,25 @@ def get_data(self, data_uid: int):
         if to_pop_index is not None:
             # remove the data from the queue if it has been accessed by all readers
             self.data_queue.pop(to_pop_index)
+            self.queue_size -= data["input_ids"].size(0)
         return ret

+    def acquire_process_lock(self, key: str):
+        # atomic lock for process
+        if key not in self.process_locks:
+            self.process_locks[key] = 1  # locked
+            return 0
+        if self.process_locks[key] == 0:
+            self.process_locks[key] = 1  # lock the process
+            return 0
+        else:
+            return 1
+
+    def release_process_lock(self, key: str):
+        # atomic unlock for process
+        assert self.process_locks.get(key, 0) == 1, f"Releasing a process lock {key} that is not locked."
+        self.process_locks[key] = 0
+
     def set_signal(self, key: str, signal: str):
         self.signals[key] = signal

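For context, a minimal usage sketch (not part of this commit) of the new SharedVariableActor methods: pickup_rollout_task reserves queue capacity before a rollout is generated, and acquire_process_lock/release_process_lock serialize a critical section across processes. The try_produce_one_batch and with_process_lock helpers, and the placeholder rollout data, are illustrative assumptions rather than repository code.

import time

import ray
import torch


def try_produce_one_batch(shared_actor, batch_size: int = 8, seq_len: int = 128) -> bool:
    # Reserve queue capacity first; the actor returns False once the reserved
    # queue size reaches buffer_size_limit, so the producer backs off instead
    # of over-filling the buffer (get_data later shrinks queue_size again).
    if not ray.get(shared_actor.pickup_rollout_task.remote(batch_size)):
        return False
    # Placeholder rollout; a real producer would run generation here.
    data = {"input_ids": torch.zeros(batch_size, seq_len, dtype=torch.long)}
    ray.get(shared_actor.append_data.remote(data))
    return True


def with_process_lock(shared_actor, key, fn):
    # Spin until the actor grants the lock (0 = acquired, 1 = already held elsewhere).
    while ray.get(shared_actor.acquire_process_lock.remote(key)) != 0:
        time.sleep(0.1)
    try:
        return fn()
    finally:
        ray.get(shared_actor.release_process_lock.remote(key))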

applications/ColossalChat/coati/distributed/consumer.py

Lines changed: 68 additions & 40 deletions
@@ -1,4 +1,5 @@
 import os
+import threading
 import time
 from typing import Any, Dict, Optional

@@ -54,6 +55,7 @@ def __init__(
         assert batch_size % minibatch_size == 0, "batch_size should be divisible by microbatch_size"
         self.num_microbatches = batch_size // minibatch_size
         self.data_uid = 0
+        self.sync_model_thread_started = False

         self.model_config = model_config
         self.plugin_config = plugin_config
@@ -64,7 +66,6 @@ def __init__(
         self.shared_sync_data_actor = shared_sync_data_actor
         self.shared_signal_actor = shared_signal_actor
         self.state_dict_cpu = {}
-        self.next_data_source = 0  # used to track which producer to get data from next

     def setup(self) -> None:
         launch(self.rank, self.world_size, self.master_addr, self.master_port, local_rank=0)
@@ -183,7 +184,6 @@ def loop(self) -> None:
                     raw_batch = ray.get(self.shared_sync_data_actor.get_data.remote(self.data_uid))
                     continue
                 self.data_uid += 1
-                self.next_data_source = (self.next_data_source + 1) % self.num_producers
                 raw_batch = {k: v.to(self.device) for k, v in raw_batch.items()}
                 # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete),
                 # we need to calculate the metrics before filtering here for logging
@@ -253,6 +253,7 @@ def loop(self) -> None:
                 if loss is not None:
                     pbar.set_postfix({"loss": loss})
                     need_sync_model = True
+                    ray.get(self.shared_signal_actor.set_signal.remote("global_step", self.global_step + 1))
                 if need_sync_model and (
                     (self.global_step + 1) % self.save_interval == 0
                     or self.received_prompts >= self.train_dataset_size
@@ -269,49 +270,76 @@ def loop(self) -> None:
                 if need_sync_model and (
                     episode != self.num_episodes - 1 or self.received_prompts != self.train_dataset_size
                 ):
-                    # sync model weights to all producers, if no model update or it is the last training step, skip syncing
-                    if self.pp_size > 1:
-                        print(
-                            f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} step {self.global_step}"
-                        )
-                    else:
-                        print(f"[T{dist.get_rank()}] Sync model episode {episode} step {self.global_step}")
-                    torch.cuda.empty_cache()
-                    self.state_dict_cpu = {k: v.cpu() for k, v in self.state_dict().items()}
-                    cc.barrier(group_name="consumer_pg")
-                    if self.pp_size > 1:
-                        if self.tp_rank == 0 and self.dp_rank == 0:
-                            self.profiler.enter("sync_model")
-                            ray.get(
-                                self.shared_signal_actor.set_signal.remote(
-                                    f"consumer_pp_{self.pp_rank}", "ready_sync_model"
-                                )
-                            )
+
+                    def sync_model_thread():
+                        # sync model weights to all producers, if no model update or it is the last training step, skip syncing
+                        if self.pp_size > 1:
                             print(
                                 f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} step {self.global_step}"
                             )
-                            ray_broadcast_tensor_dict(
-                                self.state_dict_cpu,
-                                src=0,
-                                device=torch.device("cpu"),
-                                group_name=f"sync_model_consumer_pp_{self.pp_rank}",
-                                backend="gloo",
-                            )
-                            self.profiler.exit("sync_model")
-                    else:
-                        if self.rank == 0:
-                            self.profiler.enter("sync_model")
-                            ray.get(self.shared_signal_actor.set_signal.remote("consumer", "ready_sync_model"))
+                        else:
                             print(f"[T{dist.get_rank()}] Sync model episode {episode} step {self.global_step}")
-                            ray_broadcast_tensor_dict(
-                                self.state_dict_cpu,
-                                src=0,
-                                device=torch.device("cpu"),
-                                group_name="sync_model_consumer",
-                                backend="gloo",
-                            )
-                            self.profiler.exit("sync_model")
+                        torch.cuda.empty_cache()
+                        if self.pp_size > 1:
+                            if self.tp_rank == 0 and self.dp_rank == 0:
+                                self.profiler.enter("sync_model")
+                                ray.get(
+                                    self.shared_signal_actor.set_signal.remote(
+                                        f"consumer_pp_{self.pp_rank}", "ready_sync_model"
+                                    )
+                                )
+                                print(
+                                    f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} step {self.global_step}"
+                                )
+                                ray_broadcast_tensor_dict(
+                                    self.state_dict_cpu,
+                                    src=0,
+                                    device=torch.device("cpu"),
+                                    group_name=f"sync_model_consumer_pp_{self.pp_rank}",
+                                    backend="gloo",
+                                )
+                                self.profiler.exit("sync_model")
+                        else:
+                            if self.rank == 0:
+                                self.profiler.enter("sync_model")
+                                ray.get(self.shared_signal_actor.set_signal.remote("consumer", "ready_sync_model"))
+                                print(f"[T{dist.get_rank()}] Sync model episode {episode} step {self.global_step}")
+                                ray_broadcast_tensor_dict(
+                                    self.state_dict_cpu,
+                                    src=0,
+                                    device=torch.device("cpu"),
+                                    group_name="sync_model_consumer",
+                                    backend="gloo",
+                                )
+                                self.profiler.exit("sync_model")
+
+                    if not self.sync_model_thread_started:
+                        # only sync model when the thread is not started and no other thread is broadcasting
+                        self.sync_model_thread_started = True
+                        state_dict_ = self.state_dict()
+                        if (self.pp_size > 1 and self.tp_rank == 0 and self.dp_rank == 0) or (
+                            self.pp_size == 1 and self.rank == 0
+                        ):
+                            if len(self.state_dict_cpu) == 0:
+                                # use pinned memory to speed up the transfer
+                                self.state_dict_cpu = {k: v.cpu().pin_memory() for k, v in state_dict_.items()}
+                            torch.cuda.synchronize()
+                            for k, v in state_dict_.items():
+                                self.state_dict_cpu[k].copy_(v, non_blocking=True)
+                            torch.cuda.synchronize()
+                        cc.barrier(
+                            group_name="consumer_pg"
+                        )  # to make sure all ranks have state dict offloaded to CPU before starting the thread
+                        time_before_starting_thread = time.time()
+                        threading.Thread(target=sync_model_thread).start()
+                        # sync_model_thread()
+                        self.profiler.log(
+                            f"Sync model, took {time.time() - time_before_starting_thread:.2f} seconds"
+                        )
+                        self.sync_model_thread_started = False
+                        # ray.get(self.shared_signal_actor.release_process_lock.remote("broadcasting_lock"))
             self.profiler.log(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+            self.received_prompts = 0
         ray.get(self.shared_signal_actor.set_signal.remote("consumer", "terminate"))

     def __del__(self):
applications/ColossalChat/coati/distributed/distributor.py

Lines changed: 15 additions & 43 deletions
@@ -22,15 +22,11 @@ def __init__(
     ):
         self.distributor_id = distributor_id
         self.consumer_pp_size = consumer_pp_size
-        self.state_dict_cpu = {i: {"not_ready_sync_model": torch.ones((1)).cpu()} for i in range(self.consumer_pp_size)}
+        self.state_dict_cpu = {}
         self.num_producers = num_producers
         self.shared_signal_actor = shared_signal_actor
         self.device = get_current_device()
         self.profiler = CustomProfiler(f"D{self.distributor_id}", disabled=not enable_profiling)
-        self.weight_version = {i: 0 for i in range(self.consumer_pp_size)}
-        self.producer_weight_version = {
-            j: {f"producer_{i}": 0 for i in range(self.num_producers)} for j in range(self.consumer_pp_size)
-        }

     def init_collective_group(
         self,
@@ -64,7 +60,6 @@ def loop(self):
                             backend="gloo",
                         )
                         self.profiler.exit(f"sync_model_consumer_pp_{i}")
-                        self.weight_version[i] += 1
                 for i in range(self.consumer_pp_size):
                     if signal.get(f"producer_{self.distributor_id}_pp_{i}", None) == "ready_sync_model":
                         self.profiler.enter(f"sync_model_producer_{self.distributor_id}_pp_{i}")
@@ -74,24 +69,13 @@ def loop(self):
                                 f"producer_{self.distributor_id}_pp_{i}", "not_ready_sync_model"
                             )
                         )
-                        if self.producer_weight_version[i][f"producer_{self.distributor_id}"] < self.weight_version[i]:
-                            self.producer_weight_version[i][f"producer_{self.distributor_id}"] = self.weight_version[i]
-                            ray_broadcast_tensor_dict(
-                                self.state_dict_cpu[i],
-                                1,
-                                device=torch.device("cpu"),
-                                group_name=f"sync_model_producer_{self.distributor_id}_pp_{i}",
-                                backend="gloo",
-                            )
-                        else:
-                            # broadcast a dummy tensor to save the communication cost
-                            ray_broadcast_tensor_dict(
-                                {"not_ready_sync_model": torch.ones((1)).cpu()},
-                                1,
-                                device=torch.device("cpu"),
-                                group_name=f"sync_model_producer_{self.distributor_id}_pp_{i}",
-                                backend="gloo",
-                            )
+                        ray_broadcast_tensor_dict(
+                            self.state_dict_cpu[i],
+                            1,
+                            device=torch.device("cpu"),
+                            group_name=f"sync_model_producer_{self.distributor_id}_pp_{i}",
+                            backend="gloo",
+                        )
                         self.profiler.exit(f"sync_model_producer_{self.distributor_id}_pp_{i}")
             else:
                 if signal.get("consumer", None) == "ready_sync_model":
@@ -103,7 +87,6 @@ def loop(self):
                         None, 0, device=torch.device("cpu"), group_name="sync_model_consumer", backend="gloo"
                     )
                     self.profiler.exit("sync_model_consumer")
-                    self.weight_version[0] += 1
                 if signal.get(f"producer_{self.distributor_id}", None) == "ready_sync_model":
                     self.profiler.enter(f"sync_model_producer_{self.distributor_id}")
                     # Broadcast the model state dict to all producers
@@ -112,24 +95,13 @@ def loop(self):
                             f"producer_{self.distributor_id}", "not_ready_sync_model"
                         )
                     )
-                    if self.producer_weight_version[0][f"producer_{self.distributor_id}"] < self.weight_version[0]:
-                        self.producer_weight_version[0][f"producer_{self.distributor_id}"] = self.weight_version[0]
-                        ray_broadcast_tensor_dict(
-                            self.state_dict_cpu,
-                            1,
-                            device=torch.device("cpu"),
-                            group_name=f"sync_model_producer_{self.distributor_id}",
-                            backend="gloo",
-                        )
-                    else:
-                        # broadcast a dummy tensor to save the communication cost
-                        ray_broadcast_tensor_dict(
-                            {"not_ready_sync_model": torch.ones((1)).cpu()},
-                            1,
-                            device=torch.device("cpu"),
-                            group_name=f"sync_model_producer_{self.distributor_id}",
-                            backend="gloo",
-                        )
+                    ray_broadcast_tensor_dict(
+                        self.state_dict_cpu,
+                        1,
+                        device=torch.device("cpu"),
+                        group_name=f"sync_model_producer_{self.distributor_id}",
+                        backend="gloo",
+                    )
                     self.profiler.exit(f"sync_model_producer_{self.distributor_id}")
             if signal.get("consumer", None) == "terminate":
                 self.profiler.log("terminate sync model worker")
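
A simplified sketch (hypothetical helper, assumed import path) of the distributor's receive-then-forward pattern after this change: because ray_broadcast_tensor_dict now fills and returns the dict it is given, passing the previously received dict back in reuses the CPU buffers across syncs, and the removed weight-version/dummy-broadcast bookkeeping is no longer needed since producers only signal for a sync when they actually want one.

import torch

from coati.distributed.comm import ray_broadcast_tensor_dict  # assumed import path


def relay_weights(state_dict_cpu, recv_group: str, send_group: str):
    # Receive the latest weights from the consumer (rank 0 of recv_group); on the
    # first call state_dict_cpu may be None and the buffers are allocated in place.
    state_dict_cpu = ray_broadcast_tensor_dict(
        state_dict_cpu, 0, device=torch.device("cpu"), group_name=recv_group, backend="gloo"
    )
    # Forward the same buffers to the producer group, where this process is the
    # source (src=1, matching the groups set up in the commit).
    ray_broadcast_tensor_dict(
        state_dict_cpu, 1, device=torch.device("cpu"), group_name=send_group, backend="gloo"
    )
    return state_dict_cpu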

applications/ColossalChat/coati/distributed/grpo_consumer.py

Lines changed: 0 additions & 1 deletion
@@ -495,5 +495,4 @@ def state_dict(self):
         self.policy_model._force_wait_all_gather()
         model = self.policy_model.unwrap()
         state_dict = model.state_dict()
-        state_dict["consumer_global_step"] = torch.tensor([self.global_step], device=self.device)
         return state_dict
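
With consumer_global_step no longer embedded in the broadcast state dict, the training step travels through the shared signal actor instead, which the consumer now updates via set_signal("global_step", ...) (see the consumer diff above). The reader below is a hypothetical producer-side sketch; it assumes the actor exposes a get_signal-style accessor, which does not appear in this diff.

import ray


def read_global_step(shared_signal_actor) -> int:
    # Assumption: the actor provides a getter for its signals dict; only
    # set_signal appears in this commit's diff.
    signals = ray.get(shared_signal_actor.get_signal.remote())
    return int(signals.get("global_step", 0))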
