Moved _compute_one_replica_ids from DistributedArray to PhysicalDeviceMesh (#914)

akhmedsakip · web-flow · commit f7f933875f69 · 2023-04-15T12:20:17.000-04:00
diff --git a/alpa/device_mesh.py b/alpa/device_mesh.py
@@ -638,6 +638,7 @@ class PhysicalDeviceMesh(ABC):
     num_devices_per_host: int
     mesh_id: int
     operation_executables: dict
+    one_replica_ids: dict
 
     def get_signature(self) -> str:
         """Return a signature string that contains the mesh shape and GPU
@@ -648,6 +649,27 @@ def get_signature(self) -> str:
         ret = ret.replace(" ", "-")
         return ret
 
+    def _compute_one_replica_ids(self, indices, aval_shape, sharding_spec):
+        # The pair (aval_shape, sharding_spec) maps 1-1 to indices, so it is
+        # used as the cache key; a hit returns the stored ids unchanged.
+        if (aval_shape, sharding_spec) in self.one_replica_ids:
+            return self.one_replica_ids[(aval_shape, sharding_spec)]
+
+        one_replica_indices = []
+        one_replica_host_local_ids = []
+        seen_index_hashes = set()
+        for i, index in enumerate(indices):
+            hashed_index = _hashable_index(index)
+            if hashed_index not in seen_index_hashes:
+                one_replica_indices.append(i)
+                one_replica_host_local_ids.append(
+                    divmod(i, self.num_devices_per_host))
+                seen_index_hashes.add(hashed_index)
+        self.one_replica_ids[(
+            aval_shape,
+            sharding_spec)] = one_replica_indices, one_replica_host_local_ids
+        return one_replica_indices, one_replica_host_local_ids
+
     @property
     def shape(self):
         return self.num_hosts, self.num_devices_per_host
@@ -845,6 +867,7 @@ def __init__(self, devices: Sequence["Device"] = None):
         self.mesh_id = -1
         self.device_strs = []
         self.operation_executables = {}
+        self.one_replica_ids = {}
 
         self.backend = xb.get_backend(global_config.backend)
 
@@ -974,6 +997,7 @@ def __init__(self,
         self.workers = None
         self.service_server = None
         self.operation_executables = {}
+        self.one_replica_ids = {}
         self.namespace = namespace
 
         if devices is not None:
@@ -1508,8 +1532,6 @@ def __init__(self,
         self.shape = self.aval.shape
         self.dtype = self.aval.dtype
         self._npy_value = None
-        self._one_replica_host_local_ids = None
-        self._one_replica_buffer_ids = None
         self._fetched_np_buffers = None
         self._fetched_np_buffers_ref = None
         self.skip_shard_args_check = False
@@ -1616,34 +1638,16 @@ def load(cls, path: str, aval: ShapedArray, device_mesh: PhysicalDeviceMesh,
         return DistributedArray(device_mesh, aval, sharding_spec, ary_ref,
                                 indices)
 
-    def _compute_one_replica_ids(self):
-        one_replica_indices = []
-        one_replica_host_local_ids = []
-        seen_index_hashes = set()
-        for i, index in enumerate(self.indices):
-            hashed_index = _hashable_index(index)
-            if hashed_index not in seen_index_hashes:
-                one_replica_indices.append(i)
-                one_replica_host_local_ids.append(
-                    divmod(i, self.device_mesh.num_devices_per_host))
-                seen_index_hashes.add(hashed_index)
-        self._one_replica_buffer_ids = one_replica_indices
-        self._one_replica_host_local_ids = one_replica_host_local_ids
-
-    # TODO(yonghao): to make ._value faster(in reorder buffer), cache different
-    # buffers with the same mesh shape and sharding spec.
     @property
     def one_replica_buffer_ids(self):
         """Indices of buffers containing one complete copy of the array data."""
-        if self._one_replica_buffer_ids is None:
-            self._compute_one_replica_ids()
-        return self._one_replica_buffer_ids
+        return self.device_mesh._compute_one_replica_ids(
+            self.indices, self.aval.shape, self.sharding_spec)[0]
 
     @property
     def one_replica_host_local_ids(self):
-        if self._one_replica_host_local_ids is None:
-            self._compute_one_replica_ids()
-        return self._one_replica_host_local_ids
+        return self.device_mesh._compute_one_replica_ids(
+            self.indices, self.aval.shape, self.sharding_spec)[1]
 
     @property
     def _value(self):