@@ -79,6 +79,7 @@ def __init__(
         ] = "memory",
         storage_uri: str | None = None,
         schema: str = "public",
+        max_worker: int = 4,
     ):
         """
         Initialize the DataCollector with configuration options.
@@ -101,15 +102,17 @@ def __init__(
             URI or path corresponding to the selected storage backend.
         schema: str
             Schema name used for PostgreSQL storage.
-
+        max_worker: int
+            Maximum number of worker threads used for flushing collected data asynchronously.
         """
         super().__init__(
             model=model,
             model_reporters=model_reporters,
             agent_reporters=agent_reporters,
             trigger=trigger,
             reset_memory=reset_memory,
-            storage=storage,  # literal won't work
+            storage=storage,
+            max_workers=max_worker,
         )
         self._writers = {
             "csv": self._write_csv_local,
@@ -120,6 +123,8 @@ def __init__(
         }
         self._storage_uri = storage_uri
         self._schema = schema
+        self._current_model_step = None
+        self._batch_id = None

         self._validate_inputs()

@@ -130,28 +135,42 @@ def _collect(self):
         This method checks for the presence of model and agent reporters
         and calls the appropriate collection routines for each.
         """
+        if (
+            self._current_model_step is None
+            or self._current_model_step != self._model.steps
+        ):
+            self._current_model_step = self._model.steps
+            self._batch_id = 0
+
         if self._model_reporters:
-            self._collect_model_reporters()
+            self._collect_model_reporters(
+                current_model_step=self._current_model_step, batch_id=self._batch_id
+            )

         if self._agent_reporters:
-            self._collect_agent_reporters()
+            self._collect_agent_reporters(
+                current_model_step=self._current_model_step, batch_id=self._batch_id
+            )
+
+        self._batch_id += 1

-    def _collect_model_reporters(self):
+    def _collect_model_reporters(self, current_model_step: int, batch_id: int):
         """
         Collect model-level data using the model_reporters.

         Creates a LazyFrame containing the step, seed, and values
         returned by each model reporter. Appends the LazyFrame to internal storage.
         """
         model_data_dict = {}
-        model_data_dict["step"] = self._model._steps
+        model_data_dict["step"] = current_model_step
         model_data_dict["seed"] = str(self.seed)
+        model_data_dict["batch"] = batch_id
         for column_name, reporter in self._model_reporters.items():
             model_data_dict[column_name] = reporter(self._model)
         model_lazy_frame = pl.LazyFrame([model_data_dict])
-        self._frames.append(("model", str(self._model._steps), model_lazy_frame))
+        self._frames.append(("model", current_model_step, batch_id, model_lazy_frame))

-    def _collect_agent_reporters(self):
+    def _collect_agent_reporters(self, current_model_step: int, batch_id: int):
         """
         Collect agent-level data using the agent_reporters.

@@ -164,15 +183,16 @@ def _collect_agent_reporters(self):
                 for k, v in self._model.agents[reporter].items():
                     agent_data_dict[col_name + "_" + str(k.__class__.__name__)] = v
             else:
-                agent_data_dict[col_name] = reporter(self._model.agents)
+                agent_data_dict[col_name] = reporter(self._model)
         agent_lazy_frame = pl.LazyFrame(agent_data_dict)
         agent_lazy_frame = agent_lazy_frame.with_columns(
             [
-                pl.lit(self._model._steps).alias("step"),
+                pl.lit(current_model_step).alias("step"),
                 pl.lit(str(self.seed)).alias("seed"),
+                pl.lit(batch_id).alias("batch"),
             ]
         )
-        self._frames.append(("agent", str(self._model._steps), agent_lazy_frame))
+        self._frames.append(("agent", current_model_step, batch_id, agent_lazy_frame))

     @property
     def data(self) -> dict[str, pl.DataFrame]:
@@ -185,96 +205,108 @@ def data(self) -> dict[str, pl.DataFrame]:
             A dictionary with keys "model" and "agent" mapping to concatenated DataFrames of collected data.
         """
         model_frames = [
-            lf.collect() for kind, step, lf in self._frames if kind == "model"
+            lf.collect() for kind, step, batch_id, lf in self._frames if kind == "model"
         ]
         agent_frames = [
-            lf.collect() for kind, step, lf in self._frames if kind == "agent"
+            lf.collect() for kind, step, batch_id, lf in self._frames if kind == "agent"
         ]
         return {
             "model": pl.concat(model_frames) if model_frames else pl.DataFrame(),
             "agent": pl.concat(agent_frames) if agent_frames else pl.DataFrame(),
         }

-    def _flush(self):
+    def _flush(self, frames_to_flush: list):
         """
         Flush the collected data to the configured external storage backend.

         Uses the appropriate writer function based on the specified storage option.
         """
-        self._writers[self._storage](self._storage_uri)
+        self._writers[self._storage](
+            uri=self._storage_uri, frames_to_flush=frames_to_flush
+        )

-    def _write_csv_local(self, uri: str):
+    def _write_csv_local(self, uri: str, frames_to_flush: list):
         """
         Write collected data to local CSV files.

         Parameters
         ----------
         uri: str
             Local directory path to write files into.
+        frames_to_flush: list
+            The collected frames to flush in the current thread.
         """
-        for kind, step, df in self._frames:
-            df.collect().write_csv(f"{uri}/{kind}_step{step}.csv")
+        for kind, step, batch, df in frames_to_flush:
+            df.collect().write_csv(f"{uri}/{kind}_step{step}_batch{batch}.csv")

-    def _write_parquet_local(self, uri: str):
+    def _write_parquet_local(self, uri: str, frames_to_flush: list):
         """
         Write collected data to local Parquet files.

         Parameters
         ----------
         uri: str
             Local directory path to write files into.
+        frames_to_flush: list
+            The collected frames to flush in the current thread.
         """
-        for kind, step, df in self._frames:
-            df.collect().write_parquet(f"{uri}/{kind}_step{step}.parquet")
+        for kind, step, batch, df in frames_to_flush:
+            df.collect().write_parquet(f"{uri}/{kind}_step{step}_batch{batch}.parquet")

-    def _write_csv_s3(self, uri: str):
+    def _write_csv_s3(self, uri: str, frames_to_flush: list):
         """
         Write collected data to AWS S3 in CSV format.

         Parameters
         ----------
         uri: str
             S3 URI (e.g., s3://bucket/path) to upload files to.
+        frames_to_flush: list
+            The collected frames to flush in the current thread.
         """
-        self._write_s3(uri, format_="csv")
+        self._write_s3(uri=uri, frames_to_flush=frames_to_flush, format_="csv")

-    def _write_parquet_s3(self, uri: str):
+    def _write_parquet_s3(self, uri: str, frames_to_flush: list):
         """
         Write collected data to AWS S3 in Parquet format.

         Parameters
         ----------
         uri: str
             S3 URI (e.g., s3://bucket/path) to upload files to.
+        frames_to_flush: list
+            The collected frames to flush in the current thread.
         """
-        self._write_s3(uri, format_="parquet")
+        self._write_s3(uri=uri, frames_to_flush=frames_to_flush, format_="parquet")

-    def _write_s3(self, uri: str, format_: str):
+    def _write_s3(self, uri: str, frames_to_flush: list, format_: str):
         """
         Upload collected data to S3 in a specified format.

         Parameters
         ----------
         uri: str
             S3 URI to upload to.
+        frames_to_flush: list
+            The collected frames to flush in the current thread.
         format_: str
             Format of the output files ("csv" or "parquet").
         """
         s3 = boto3.client("s3")
         parsed = urlparse(uri)
         bucket = parsed.netloc
         prefix = parsed.path.lstrip("/")
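+        # Stage each frame in a temporary local file, then upload it under a per-batch key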
-        for kind, step, lf in self._frames:
+        for kind, step, batch, lf in frames_to_flush:
             df = lf.collect()
             with tempfile.NamedTemporaryFile(suffix=f".{format_}") as tmp:
                 if format_ == "csv":
                     df.write_csv(tmp.name)
                 elif format_ == "parquet":
                     df.write_parquet(tmp.name)
-                key = f"{prefix}/{kind}_step{step}.{format_}"
+                key = f"{prefix}/{kind}_step{step}_batch{batch}.{format_}"
                 s3.upload_file(tmp.name, bucket, key)

-    def _write_postgres(self, uri: str):
+    def _write_postgres(self, uri: str, frames_to_flush: list):
         """
         Write collected data to a PostgreSQL database.

@@ -285,10 +317,12 @@ def _write_postgres(self, uri: str):
         ----------
         uri: str
             PostgreSQL connection URI in the form postgresql://testuser:testpass@localhost:5432/testdb
+        frames_to_flush: list
+            The collected frames to flush in the current thread.
         """
         conn = self._get_db_connection(uri=uri)
         cur = conn.cursor()
-        for kind, step, lf in self._frames:
+        for kind, step, batch, lf in frames_to_flush:
             df = lf.collect()
             table = f"{kind}_data"
             cols = df.columns
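
For context, here is a minimal usage sketch of the batched flushing flow this change introduces. It is an illustration only: the `mesa_frames` import path, the model object, the reporter lambda, and the public `collect()`/`flush()` entry points (the diff only shows the private `_collect`/`_flush`) are assumptions, not taken from this diff.

```python
# Hypothetical usage sketch -- names marked below are assumptions, not part of this diff.
from mesa_frames import DataCollector  # assumed import path

collector = DataCollector(
    model=model,  # an already-constructed model instance (assumed)
    model_reporters={"n_agents": lambda m: len(m.agents)},  # illustrative reporter
    storage="csv",
    storage_uri="./results",
    max_worker=4,  # worker threads available for asynchronous flushes
)

for _ in range(5):
    model.step()
    collector.collect()  # first collect in a step records batch 0
    collector.collect()  # a second collect in the same step records batch 1

# Each (kind, step, batch) frame lands in its own file,
# e.g. ./results/model_step3_batch1.csv
collector.flush()
```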