hpcaitech
diff --git a/applications/ColossalChat/coati/distributed/comm.py b/applications/ColossalChat/coati/distributed/comm.py
Lines changed: 28 additions & 138 deletions
@@ -1,16 +1,12 @@
 import copy
-import time
 from typing import Any, Dict
 
 import ray
 import ray.util.collective as cc
 import torch
 import torch.distributed.distributed_c10d as c10d
-from coati.distributed.profiling_utils import CustomProfiler
 from packaging.version import Version
 
-from colossalai.utils import get_current_device
-
 
 def ray_broadcast_object(obj: Any, src: int = 0, device=None, group_name: str = "default") -> Any:
     rank = cc.get_rank(group_name)
@@ -42,6 +38,7 @@ def ray_broadcast_tensor_dict(
     src: int = 0,
     device=None,
     group_name: str = "default",
+    backend: str = "nccl",
     offload_to_cpu: bool = False,
 ) -> Dict[str, torch.Tensor]:
     rank = cc.get_rank(group_name)
@@ -62,7 +59,13 @@ def ray_broadcast_tensor_dict(
                 tensor = tensor_dict[k]
         else:
             tensor = torch.empty(shape, dtype=dtype, device=device)
+        if backend == "gloo" and dtype == torch.bfloat16:
+            # Gloo does not support bfloat16, so reinterpret the same 16-bit payload as float16 (a dtype view, not a numeric cast)
+            tensor = tensor.view(torch.float16)
         cc.broadcast(tensor, src, group_name)
+        if backend == "gloo" and dtype == torch.bfloat16:
+            # Reinterpret the broadcast bits back as bfloat16 (undoes the float16 dtype view above)
+            tensor = tensor.view(torch.bfloat16)
         if rank != src:
             if offload_to_cpu:
                 out_dict[k] = tensor.cpu()
@@ -77,155 +80,42 @@ def ray_broadcast_tensor_dict(
 class SharedVariableActor:
     def __init__(self, number_of_readers: int = 1):
         self.data_queue = []
-        self.model_weights = None
-        self.data_access_count = 0
-        self.ready_process_count = {}
+        self.data_uid = 0
         self.number_of_readers = number_of_readers
-        self.consumer_buffer_size = 0
         self.signals = {}
+        self.signal_procs_meet_count = {}
 
     def get_queued_data_size(self):
-        queued_data_size = sum([data["input_ids"].size(0) for data in self.data_queue])
+        queued_data_size = sum([data[1]["input_ids"].size(0) for data in self.data_queue])
         return queued_data_size
 
     def append_data(self, data):
-        self.data_queue.append(data)
+        self.data_queue.append([self.data_uid, data, 0])  # [data_uid, data, access_count]
+        self.data_uid += 1
         return True
 
-    def get_data(self):
+    def get_data(self, data_uid: int):
+        # for multi-process data reading
         if not self.data_queue:
             # no data in the queue, return None
             return None
-        data = copy.deepcopy(self.data_queue[0])
-        self.data_access_count += 1
-        if self.data_access_count == self.number_of_readers:
-            # first data in data_queue has been accessed by all consumers
-            # remove it from the queue
-            self.data_queue.pop(0)
-            self.data_access_count = 0
-        return data
+        to_pop_index = None
+        ret = None
+        for i, (uid, data, access_count) in enumerate(self.data_queue):
+            if uid == data_uid:
+                # found the data with the given uid
+                self.data_queue[i][2] += 1
+                ret = copy.deepcopy(data)
+                if self.data_queue[i][2] == self.number_of_readers:
+                    to_pop_index = i
+                break
+        if to_pop_index is not None:
+            # remove the data from the queue if it has been accessed by all readers
+            self.data_queue.pop(to_pop_index)
+        return ret
 
     def set_signal(self, key: str, signal: str):
         self.signals[key] = signal
 
     def get_signal(self):
         return self.signals
-
-
-@ray.remote
-class SharedVariableActorNCCL:
-    def __init__(
-        self, consumer_pp_size, num_producers, shared_signal_actor: SharedVariableActor, enable_profiling: bool = True
-    ):
-        self.consumer_pp_size = consumer_pp_size
-        self.state_dict_cpu = {i: {"not_ready_sync_model": torch.ones((1)).cpu()} for i in range(self.consumer_pp_size)}
-        self.num_producers = num_producers
-        self.shared_signal_actor = shared_signal_actor
-        self.device = get_current_device()
-        self.profiler = CustomProfiler(f"D", disabled=not enable_profiling)
-        self.weight_version = {i: 0 for i in range(self.consumer_pp_size)}
-        self.producer_weight_version = {
-            j: {f"producer_{i}": 0 for i in range(self.num_producers)} for j in range(self.consumer_pp_size)
-        }
-
-    def setup(self):
-        if self.consumer_pp_size == 1:
-            cc.init_collective_group(2, 1, group_name="sync_model_consumer")
-            for i in range(self.num_producers):
-                cc.init_collective_group(2, 1, group_name=f"sync_model_producer_{i}")
-        else:
-            for i in range(self.consumer_pp_size):
-                cc.init_collective_group(2, 1, group_name=f"sync_model_consumer_pp_{i}")
-            for i in range(self.num_producers):
-                for j in range(self.consumer_pp_size):
-                    cc.init_collective_group(2, 1, group_name=f"sync_model_producer_{i}_pp_{j}")
-
-    def loop(self):
-        while True:
-            time.sleep(1)
-            signal = ray.get(self.shared_signal_actor.get_signal.remote())
-            if self.consumer_pp_size > 1:
-                for i in range(self.consumer_pp_size):
-                    if signal.get(f"consumer_pp_{i}", None) == "ready_sync_model":
-                        self.profiler.enter(f"sync_model_consumer_pp_{i}")
-                        ray.get(self.shared_signal_actor.set_signal.remote(f"consumer_pp_{i}", "not_ready_sync_model"))
-                        # Broadcast the model state dict from consumer to shared variable actor
-                        self.state_dict_cpu[i] = ray_broadcast_tensor_dict(
-                            None,
-                            0,
-                            device=self.device,
-                            group_name=f"sync_model_consumer_pp_{i}",
-                            offload_to_cpu=True,
-                        )
-                        self.profiler.exit(f"sync_model_consumer_pp_{i}")
-                        self.weight_version[i] += 1
-                for j in range(self.num_producers):
-                    for i in range(self.consumer_pp_size):
-                        if signal.get(f"producer_{j}_pp_{i}", None) == "ready_sync_model":
-                            self.profiler.enter(f"sync_model_producer_{j}_pp_{i}")
-                            # Broadcast the model state dict to all producers
-                            ray.get(
-                                self.shared_signal_actor.set_signal.remote(
-                                    f"producer_{j}_pp_{i}", "not_ready_sync_model"
-                                )
-                            )
-                            if self.producer_weight_version[i][f"producer_{j}"] < self.weight_version[i]:
-                                self.producer_weight_version[i][f"producer_{j}"] = self.weight_version[i]
-                                ray_broadcast_tensor_dict(
-                                    self.state_dict_cpu[i],
-                                    1,
-                                    device=self.device,
-                                    group_name=f"sync_model_producer_{j}_pp_{i}",
-                                    offload_to_cpu=True,
-                                )
-                            else:
-                                # broadcast a dummy tensor to save the communication cost
-                                ray_broadcast_tensor_dict(
-                                    {"not_ready_sync_model": torch.ones((1)).cpu()},
-                                    1,
-                                    device=self.device,
-                                    group_name=f"sync_model_producer_{j}_pp_{i}",
-                                    offload_to_cpu=True,
-                                )
-                            self.profiler.exit(f"sync_model_producer_{j}_pp_{i}")
-            else:
-                if signal.get("consumer", None) == "ready_sync_model":
-                    self.profiler.enter("sync_model_consumer")
-                    ray.get(self.shared_signal_actor.set_signal.remote("consumer", "not_ready_sync_model"))
-                    # Broadcast the model state dict from consumer to shared variable actor
-                    self.state_dict_cpu = ray_broadcast_tensor_dict(
-                        None,
-                        0,
-                        device=self.device,
-                        group_name="sync_model_consumer",
-                        offload_to_cpu=True,
-                    )
-                    self.profiler.exit("sync_model_consumer")
-                    self.weight_version[0] += 1
-                for i in range(self.num_producers):
-                    if signal.get(f"producer_{i}", None) == "ready_sync_model":
-                        self.profiler.enter(f"sync_model_producer_{i}")
-                        # Broadcast the model state dict to all producers
-                        ray.get(self.shared_signal_actor.set_signal.remote(f"producer_{i}", "not_ready_sync_model"))
-                        if self.producer_weight_version[0][f"producer_{i}"] < self.weight_version[0]:
-                            self.producer_weight_version[0][f"producer_{i}"] = self.weight_version[0]
-                            ray_broadcast_tensor_dict(
-                                self.state_dict_cpu,
-                                1,
-                                device=self.device,
-                                group_name=f"sync_model_producer_{i}",
-                                offload_to_cpu=True,
-                            )
-                        else:
-                            # broadcast a dummy tensor to save the communication cost
-                            ray_broadcast_tensor_dict(
-                                {"not_ready_sync_model": torch.ones((1)).cpu()},
-                                1,
-                                device=self.device,
-                                group_name=f"sync_model_producer_{i}",
-                                offload_to_cpu=True,
-                            )
-                        self.profiler.exit(f"sync_model_producer_{i}")
-            if signal.get("consumer", None) == "terminate":
-                self.profiler.log("terminate sync model worker")
-                break