Skip to content

Commit 5237e06

Browse files
Fix type annotation (#29)
* Fix type annotation * First refactoring * Fix bug * Refactor scheduling actor * Remove useless variable * Remove useless parameter * Add comment * Remove useless field * Bump to 0.2.0
1 parent 515a559 commit 5237e06

File tree

4 files changed

+134
-123
lines changed

4 files changed

+134
-123
lines changed

doreisa/_scheduling_actor.py

Lines changed: 55 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -48,19 +48,6 @@ def __init__(self):
4848
self.refs: dict[str, ray.ObjectRef] = {}
4949

5050

51-
@dataclass
52-
class ChunkReadyInfo:
53-
# Information about the array
54-
array_name: str
55-
timestep: Timestep
56-
dtype: np.dtype
57-
nb_chunks_per_dim: tuple[int, ...]
58-
59-
# Information about the chunk
60-
position: tuple[int, ...]
61-
size: tuple[int, ...]
62-
63-
6451
@ray.remote(num_cpus=0, enable_task_events=False)
6552
def patched_dask_task_wrapper(func, repack, key, ray_pretask_cbs, ray_posttask_cbs, *args, first_call=True):
6653
"""
@@ -105,6 +92,27 @@ def remote_ray_dask_get(dsk, keys):
10592
return ray.util.dask.ray_dask_get(dsk, keys, ray_persist=True)
10693

10794

95+
class _ArrayTimestep:
96+
def __init__(self):
97+
# Triggered when all the chunks are ready
98+
self.chunks_ready_event: asyncio.Event = asyncio.Event()
99+
100+
# {position: chunk}
101+
self.local_chunks: dict[tuple[int, ...], ray.ObjectRef | bytes] = {}
102+
103+
104+
class _Array:
105+
def __init__(self):
106+
# Indicates if set_owned_chunks method has been called for this array.
107+
self.is_registered = False
108+
109+
# Chunks owned by this actor for this array.
110+
# {(chunk position, chunk size), ...}
111+
self.owned_chunks: set[tuple[tuple[int, ...], tuple[int, ...]]] = set()
112+
113+
self.timesteps: dict[Timestep, _ArrayTimestep] = {}
114+
115+
108116
@ray.remote
109117
class SchedulingActor:
110118
"""
@@ -119,15 +127,7 @@ def __init__(self, actor_id: int) -> None:
119127
self.scheduling_actors: list[ray.actor.ActorHandle] = []
120128

121129
# For collecting chunks
122-
123-
# Triggered when all the chunks are ready
124-
self.chunks_ready_event = asyncio.Event()
125-
126-
self.chunks_info: dict[str, list[ChunkReadyInfo]] = {}
127-
128-
# (dask_array_name, position) -> chunk
129-
# The Dask array name contains the timestep
130-
self.local_chunks: dict[tuple[str, Timestep, tuple[int, ...]], ray.ObjectRef | bytes] = {}
130+
self.arrays: dict[str, _Array] = {}
131131

132132
# For scheduling
133133
self.new_graph_available = asyncio.Event()
@@ -158,43 +158,48 @@ async def add_chunk(
158158
chunk: list[ray.ObjectRef],
159159
chunk_shape: tuple[int, ...],
160160
) -> None:
161-
assert (array_name, timestep, chunk_position) not in self.local_chunks
162-
163-
self.local_chunks[(array_name, timestep, chunk_position)] = self.actor_handle._pack_object_ref.remote(chunk)
164-
165-
if array_name not in self.chunks_info:
166-
self.chunks_info[array_name] = []
167-
chunks_info = self.chunks_info[array_name]
168-
169-
chunks_info.append(
170-
ChunkReadyInfo(
171-
array_name=array_name,
172-
timestep=timestep,
173-
dtype=dtype,
174-
nb_chunks_per_dim=nb_chunks_per_dim,
175-
position=chunk_position,
176-
size=chunk_shape,
177-
)
178-
)
161+
if array_name not in self.arrays:
162+
self.arrays[array_name] = _Array()
163+
array = self.arrays[array_name]
164+
165+
if timestep not in array.timesteps:
166+
array.timesteps[timestep] = _ArrayTimestep()
167+
array_timestep = array.timesteps[timestep]
168+
169+
assert chunk_position not in array_timestep.local_chunks
170+
array_timestep.local_chunks[chunk_position] = self.actor_handle._pack_object_ref.remote(chunk)
171+
172+
array.owned_chunks.add((chunk_position, chunk_shape))
173+
174+
if len(array_timestep.local_chunks) == nb_chunks_of_node:
175+
if not array.is_registered:
176+
# Register the array with the head node
177+
await self.head.set_owned_chunks.options(enable_task_events=False).remote(
178+
self.actor_id,
179+
array_name,
180+
dtype,
181+
nb_chunks_per_dim,
182+
list(array.owned_chunks),
183+
)
184+
array.is_registered = True
179185

180-
if len(chunks_info) == nb_chunks_of_node:
181186
chunks = []
182-
for info in chunks_info:
183-
c = self.local_chunks[(info.array_name, info.timestep, info.position)]
187+
for position, size in array.owned_chunks:
188+
c = array_timestep.local_chunks[position]
184189
assert isinstance(c, ray.ObjectRef)
185190
chunks.append(c)
186-
self.local_chunks[(info.array_name, info.timestep, info.position)] = pickle.dumps(c)
191+
array_timestep.local_chunks[position] = pickle.dumps(c)
187192

188193
all_chunks_ref = ray.put(chunks)
189194

190195
await self.head.chunks_ready.options(enable_task_events=False).remote(
191-
chunks_info, self.actor_id, [all_chunks_ref]
196+
array_name, timestep, [all_chunks_ref]
192197
)
193-
self.chunks_info[array_name] = []
194-
self.chunks_ready_event.set()
195-
self.chunks_ready_event.clear()
198+
199+
array_timestep.chunks_ready_event.set()
200+
array_timestep.chunks_ready_event.clear()
196201
else:
197-
await self.chunks_ready_event.wait()
202+
await array_timestep.chunks_ready_event.wait()
198203

199204
def store_graph(self, graph_id: int, dsk: dict) -> None:
200205
"""
@@ -228,7 +233,7 @@ async def schedule_graph(self, graph_id: int):
228233
if isinstance(val, ChunkRef):
229234
assert val.actor_id == self.actor_id
230235

231-
encoded_ref = self.local_chunks[(val.array_name, val.timestep, val.position)]
236+
encoded_ref = self.arrays[val.array_name].timesteps[val.timestep].local_chunks[val.position]
232237
assert isinstance(encoded_ref, bytes)
233238
dsk[key] = pickle.loads(encoded_ref)
234239

doreisa/head_node.py

Lines changed: 77 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from doreisa import Timestep
1616
from doreisa._scheduler import doreisa_get
17-
from doreisa._scheduling_actor import ChunkReadyInfo, ChunkRef, SchedulingActor
17+
from doreisa._scheduling_actor import ChunkRef, SchedulingActor
1818

1919

2020
def init():
@@ -39,9 +39,8 @@ class _DaskArrayData:
3939
Information about a Dask array being built.
4040
"""
4141

42-
def __init__(self, definition: ArrayDefinition, timestep: Timestep) -> None:
42+
def __init__(self, definition: ArrayDefinition) -> None:
4343
self.definition = definition
44-
self.timestep = timestep
4544

4645
# This will be set when the first chunk is added
4746
self.nb_chunks_per_dim: tuple[int, ...] | None = None
@@ -56,28 +55,25 @@ def __init__(self, definition: ArrayDefinition, timestep: Timestep) -> None:
5655
# ID of the scheduling actor in charge of the chunk at each position
5756
self.scheduling_actors_id: dict[tuple[int, ...], int] = {}
5857

58+
# Number of scheduling actors owning chunks of this array.
59+
self.nb_scheduling_actors: int | None = None
60+
5961
# Each reference comes from one scheduling actor. The reference is a list of
6062
# ObjectRefs, each ObjectRef corresponding to a chunk. These references
6163
# shouldn't be used directly. They exist only to release the memory
6264
# automatically.
6365
# When the array is built, these references are put in the object store, and the
6466
# global reference is added to the Dask graph. Then, the list is cleared.
65-
self.chunk_refs: list[ray.ObjectRef] = []
67+
self.chunk_refs: dict[Timestep, list[ray.ObjectRef]] = {}
6668

67-
def add_chunk(
69+
def set_chunk_owner(
6870
self,
69-
size: tuple[int, ...],
70-
position: tuple[int, ...],
71-
dtype: np.dtype,
7271
nb_chunks_per_dim: tuple[int, ...],
72+
dtype: np.dtype,
73+
position: tuple[int, ...],
74+
size: tuple[int, ...],
7375
scheduling_actor_id: int,
74-
) -> bool:
75-
"""
76-
Add a chunk to the array.
77-
78-
Return:
79-
True if the array is ready, False otherwise.
80-
"""
76+
) -> None:
8177
if self.nb_chunks_per_dim is None:
8278
self.nb_chunks_per_dim = nb_chunks_per_dim
8379
self.nb_chunks = math.prod(nb_chunks_per_dim)
@@ -100,31 +96,43 @@ def add_chunk(
10096
else:
10197
assert self.chunks_size[d][position[d]] == size[d]
10298

103-
if len(self.scheduling_actors_id) == self.nb_chunks: # The array is ready
104-
return True
105-
return False
99+
def add_chunk_ref(self, chunk_ref: ray.ObjectRef, timestep: Timestep) -> bool:
100+
"""
101+
Add a reference sent by a scheduling actor.
102+
103+
Return:
104+
True if all the chunks for this timestep are ready, False otherwise.
105+
"""
106+
self.chunk_refs[timestep].append(chunk_ref)
106107

107-
def add_chunk_ref(self, chunk_ref: ray.ObjectRef) -> None:
108-
self.chunk_refs.append(chunk_ref)
108+
# We don't know all the owners yet
109+
if len(self.scheduling_actors_id) != self.nb_chunks:
110+
return False
109111

110-
def get_full_array(self) -> da.Array:
112+
if self.nb_scheduling_actors is None:
113+
self.nb_scheduling_actors = len(set(self.scheduling_actors_id.values()))
114+
115+
return len(self.chunk_refs[timestep]) == self.nb_scheduling_actors
116+
117+
def get_full_array(self, timestep: Timestep) -> da.Array:
111118
"""
112119
Return the full Dask array.
113120
"""
114121
assert len(self.scheduling_actors_id) == self.nb_chunks
115122
assert self.nb_chunks is not None and self.nb_chunks_per_dim is not None
116123

117-
all_chunks = ray.put(self.chunk_refs)
124+
all_chunks = ray.put(self.chunk_refs[timestep])
125+
del self.chunk_refs[timestep]
118126

119127
# We need to add the timestep since the same name can be used several times for different
120128
# timesteps
121-
dask_name = f"{self.definition.name}_{self.timestep}"
129+
dask_name = f"{self.definition.name}_{timestep}"
122130

123131
graph = {
124132
# We need to repeat the name and position in the value since the key might be removed
125133
# by the Dask optimizer
126134
(dask_name,) + position: ChunkRef(
127-
actor_id, self.definition.name, self.timestep, position, _all_chunks=all_chunks if it == 0 else None
135+
actor_id, self.definition.name, timestep, position, _all_chunks=all_chunks if it == 0 else None
128136
)
129137
for it, (position, actor_id) in enumerate(self.scheduling_actors_id.items())
130138
}
@@ -177,21 +185,18 @@ def __init__(self, arrays_definitions: list[ArrayDefinition], max_pending_arrays
177185
# For each ID of a simulation node, the corresponding scheduling actor
178186
self.scheduling_actors: dict[str, ray.actor.ActorHandle] = {}
179187

180-
self.arrays_definition: dict[str, ArrayDefinition] = {
181-
definition.name: definition for definition in arrays_definitions
182-
}
183-
184-
# Must be used before creating a new array
188+
# Must be used before creating a new array, to prevent the simulation from being
189+
# too many iterations in advance of the analytics.
185190
self.new_pending_array_semaphore = asyncio.Semaphore(max_pending_arrays)
186191

187-
# Triggered when a new array is added to self.arrays
188192
self.new_array_created = asyncio.Event()
189193

190-
# Arrays beeing built
191-
self.arrays: dict[tuple[str, Timestep], _DaskArrayData] = {}
194+
self.arrays: dict[str, _DaskArrayData] = {
195+
definition.name: _DaskArrayData(definition) for definition in arrays_definitions
196+
}
192197

193198
# All the newly created arrays
194-
self.arrays_ready: asyncio.Queue[tuple[str, int, da.Array]] = asyncio.Queue()
199+
self.arrays_ready: asyncio.Queue[tuple[str, Timestep, da.Array]] = asyncio.Queue()
195200

196201
def list_scheduling_actors(self) -> list[ray.actor.ActorHandle]:
197202
"""
@@ -233,11 +238,22 @@ def preprocessing_callbacks(self) -> dict[str, Callable]:
233238
"""
234239
Return the preprocessing callbacks for each array.
235240
"""
236-
return {name: definition.preprocess for name, definition in self.arrays_definition.items()}
241+
return {name: array.definition.preprocess for name, array in self.arrays.items()}
237242

238-
async def chunks_ready(
239-
self, chunks: list[ChunkReadyInfo], scheduling_actor_id: int, all_chunks_ref: list[ray.ObjectRef]
240-
) -> None:
243+
def set_owned_chunks(
244+
self,
245+
scheduling_actor_id: int,
246+
array_name: str,
247+
dtype: np.dtype,
248+
nb_chunks_per_dim: tuple[int, ...],
249+
chunks: list[tuple[tuple[int, ...], tuple[int, ...]]], # [(chunk position, chunk size), ...]
250+
):
251+
array = self.arrays[array_name]
252+
253+
for position, size in chunks:
254+
array.set_chunk_owner(nb_chunks_per_dim, dtype, position, size, scheduling_actor_id)
255+
256+
async def chunks_ready(self, array_name: str, timestep: Timestep, all_chunks_ref: list[ray.ObjectRef]) -> None:
241257
"""
242258
Called by the scheduling actors to inform the head actor that the chunks are ready.
243259
The chunks are not sent.
@@ -246,49 +262,39 @@ async def chunks_ready(
246262
chunks: Information about the chunks that are ready.
247263
source_actor: Handle to the scheduling actor owning the chunks.
248264
"""
249-
for it, chunk in enumerate(chunks):
250-
while (chunk.array_name, chunk.timestep) not in self.arrays:
251-
t1 = asyncio.create_task(self.new_pending_array_semaphore.acquire())
252-
t2 = asyncio.create_task(self.new_array_created.wait())
265+
array = self.arrays[array_name]
253266

254-
done, pending = await asyncio.wait([t1, t2], return_when=asyncio.FIRST_COMPLETED)
267+
while timestep not in array.chunk_refs:
268+
t1 = asyncio.create_task(self.new_pending_array_semaphore.acquire())
269+
t2 = asyncio.create_task(self.new_array_created.wait())
255270

256-
for task in pending:
257-
task.cancel()
271+
done, pending = await asyncio.wait([t1, t2], return_when=asyncio.FIRST_COMPLETED)
258272

259-
if t1 in done:
260-
if (chunk.array_name, chunk.timestep) in self.arrays:
261-
# The array was already created by another scheduling actor
262-
self.new_pending_array_semaphore.release()
263-
else:
264-
self.arrays[(chunk.array_name, chunk.timestep)] = _DaskArrayData(
265-
self.arrays_definition[chunk.array_name], chunk.timestep
266-
)
273+
for task in pending:
274+
task.cancel()
267275

268-
self.new_array_created.set()
269-
self.new_array_created.clear()
276+
if t1 in done:
277+
if timestep in array.chunk_refs:
278+
# The array was already created by another scheduling actor
279+
self.new_pending_array_semaphore.release()
280+
else:
281+
array.chunk_refs[timestep] = []
270282

271-
array = self.arrays[(chunk.array_name, chunk.timestep)]
283+
self.new_array_created.set()
284+
self.new_array_created.clear()
272285

273-
# TODO refactor so that the function works with only one array
274-
if it == 0:
275-
array.add_chunk_ref(all_chunks_ref[0])
286+
is_ready = array.add_chunk_ref(all_chunks_ref[0], timestep)
276287

277-
is_ready = array.add_chunk(
278-
chunk.size, chunk.position, chunk.dtype, chunk.nb_chunks_per_dim, scheduling_actor_id
279-
)
280-
281-
if is_ready:
282-
self.arrays_ready.put_nowait(
283-
(
284-
chunk.array_name,
285-
array.timestep,
286-
array.get_full_array(),
287-
)
288+
if is_ready:
289+
self.arrays_ready.put_nowait(
290+
(
291+
array_name,
292+
timestep,
293+
array.get_full_array(timestep),
288294
)
289-
del self.arrays[(chunk.array_name, chunk.timestep)]
295+
)
290296

291-
async def get_next_array(self) -> tuple[str, int, da.Array]:
297+
async def get_next_array(self) -> tuple[str, Timestep, da.Array]:
292298
array = await self.arrays_ready.get()
293299
self.new_pending_array_semaphore.release()
294300
return array

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "doreisa"
3-
version = "0.1.6"
3+
version = "0.2.0"
44
description = ""
55
authors = [{ name = "Adrien Vannson", email = "adrien.vannson@protonmail.com" }]
66
requires-python = ">=3.12"

0 commit comments

Comments
 (0)