From 421d4c345c4dd0e3a48c3019e95b820bd9d9f074 Mon Sep 17 00:00:00 2001
From: Suraj Panickar <panickarsuraj.1@gmail.com>
Date: Sun, 1 Mar 2026 11:10:46 +0530
Subject: [PATCH 1/6] fix: store graded EpisodicMemory entries as MemoryEntry
 objects and use correct LLM instance

---
 mesa_llm/memory/episodic_memory.py        | 33 ++++++++++++++++++-----
 tests/test_memory/test_episodic_memory.py | 16 ++++++++---
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/mesa_llm/memory/episodic_memory.py b/mesa_llm/memory/episodic_memory.py
index d473cc6b..a635a5f3 100644
--- a/mesa_llm/memory/episodic_memory.py
+++ b/mesa_llm/memory/episodic_memory.py
@@ -89,7 +89,7 @@ def grade_event_importance(self, type: str, content: dict) -> float:
         prompt = self._build_grade_prompt(type, content)
         self.llm.system_prompt = self.system_prompt
 
-        rsp = self.agent.llm.generate(
+        rsp = self.llm.generate(
             prompt=prompt,
             response_format=EventGrade,
         )
@@ -104,7 +104,7 @@ async def agrade_event_importance(self, type: str, content: dict) -> float:
         prompt = self._build_grade_prompt(type, content)
         self.llm.system_prompt = self.system_prompt
 
-        rsp = await self.agent.llm.agenerate(
+        rsp = await self.llm.agenerate(
             prompt=prompt,
             response_format=EventGrade,
         )
@@ -126,17 +126,38 @@ def retrieve_top_k_entries(self, k: int) -> list[MemoryEntry]:
 
     def add_to_memory(self, type: str, content: dict):
         """
-        Add a new memory entry to the memory
+        grading logic + Add a new memory entry to the memory
         """
-        content["importance"] = self.grade_event_importance(type, content)
+        graded_content = {
+            **content,
+            "importance": self.grade_event_importance(type, content),
+        }
+
+        new_entry = MemoryEntry(
+            agent=self.agent,
+            content={type: graded_content},
+            step=self.agent.model.steps,
+        )
+        self.memory_entries.append(new_entry)
 
         super().add_to_memory(type, content)
 
     async def aadd_to_memory(self, type: str, content: dict):
         """
-        Async version of add_to_memory
+        Async version of add_to_memory + grading logic
         """
-        content["importance"] = await self.agrade_event_importance(type, content)
+        graded_content = {
+            **content,
+            "importance": await self.agrade_event_importance(type, content),
+        }
+
+        new_entry = MemoryEntry(
+            agent=self.agent,
+            content={type: graded_content},
+            step=self.agent.model.steps,
+        )
+        self.memory_entries.append(new_entry)
+
         super().add_to_memory(type, content)
 
     def get_prompt_ready(self) -> str:
diff --git a/tests/test_memory/test_episodic_memory.py b/tests/test_memory/test_episodic_memory.py
index c8f91f69..f4fcd053 100644
--- a/tests/test_memory/test_episodic_memory.py
+++ b/tests/test_memory/test_episodic_memory.py
@@ -51,6 +51,10 @@ def test_add_memory_entry(self, mock_agent):
         """Test adding memories to Episodic memory"""
         memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
 
+        mock_response = MagicMock()
+        mock_response.choices[0].message.content = json.dumps({"grade": 3})
+        memory.llm.generate = MagicMock(return_value=mock_response)
+
         # Test basic addition with observation
         memory.add_to_memory("observation", {"step": 1, "content": "Test content"})
 
@@ -62,6 +66,9 @@ def test_add_memory_entry(self, mock_agent):
 
         # Should be empty step_content initially
         assert memory.step_content != {}
+        assert len(memory.memory_entries) == 3, (
+            "add_to_memory graded the event but never created a MemoryEntry"
+        )
 
     def test_grade_event_importance(self, mock_agent):
         """Test grading event importance"""
@@ -70,7 +77,7 @@ def test_grade_event_importance(self, mock_agent):
         # 1. Set up a specific grade for this test
         mock_response = MagicMock()
         mock_response.choices[0].message.content = json.dumps({"grade": 5})
-        mock_agent.llm.generate.return_value = mock_response
+        memory.llm.generate = MagicMock(return_value=mock_response)
 
         # 2. Call the method
         grade = memory.grade_event_importance("observation", {"data": "critical info"})
@@ -79,7 +86,7 @@ def test_grade_event_importance(self, mock_agent):
         assert grade == 5
 
         # 4. Assert the LLM was called correctly
-        mock_agent.llm.generate.assert_called_once()
+        memory.llm.generate.assert_called_once()
 
         # Check that the system prompt was set on the llm object
         assert memory.llm.system_prompt == memory.system_prompt
@@ -192,7 +199,7 @@ async def test_async_add_memory_entry(self, mock_agent):
         ]
 
         # Assigns the mock response
-        mock_agent.llm.agenerate = AsyncMock(return_value=mock_response)
+        memory.llm.agenerate = AsyncMock(return_value=mock_response)
 
         # adds content into the memory using the async counter part of add_to_memory function
         await memory.aadd_to_memory("observation", {"content": "Test content"})
@@ -201,6 +208,9 @@ async def test_async_add_memory_entry(self, mock_agent):
 
         # checks to ensure that step content is not empty
         assert memory.step_content != {}
+        assert len(memory.memory_entries) == 3, (
+            "aadd_to_memory graded the event but never created a MemoryEntry"
+        )
 
     def test_build_grade_prompt_no_previous_entries(self, mock_agent):
         """

From d3db689258bd82bb504fbfa2dcc19b9efc0cfb33 Mon Sep 17 00:00:00 2001
From: Suraj Panickar <panickarsuraj.1@gmail.com>
Date: Mon, 2 Mar 2026 22:04:25 +0530
Subject: [PATCH 2/6] Refactored sync/async add to memory functions and updated
 tests.

---
 mesa_llm/memory/episodic_memory.py        | 32 +++++++++--------------
 tests/test_memory/test_episodic_memory.py | 23 ++++++++++++++++
 2 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/mesa_llm/memory/episodic_memory.py b/mesa_llm/memory/episodic_memory.py
index a635a5f3..1c2dc7db 100644
--- a/mesa_llm/memory/episodic_memory.py
+++ b/mesa_llm/memory/episodic_memory.py
@@ -124,23 +124,25 @@ def retrieve_top_k_entries(self, k: int) -> list[MemoryEntry]:
 
         return top_list[:k]
 
-    def add_to_memory(self, type: str, content: dict):
-        """
-        grading logic + Add a new memory entry to the memory
-        """
-        graded_content = {
-            **content,
-            "importance": self.grade_event_importance(type, content),
-        }
-
+    def _finalize_entry(self, type: str, graded_content: dict):
+        """Shared function for both sync/async add to memory which helps to create memory entry and stores it into a base class."""
         new_entry = MemoryEntry(
             agent=self.agent,
             content={type: graded_content},
             step=self.agent.model.steps,
         )
         self.memory_entries.append(new_entry)
+        super().add_to_memory(type, graded_content)
 
-        super().add_to_memory(type, content)
+    def add_to_memory(self, type: str, content: dict):
+        """
+        grading logic + adding to memory function call
+        """
+        graded_content = {
+            **content,
+            "importance": self.grade_event_importance(type, content),
+        }
+        self._finalize_entry(type, graded_content)
 
     async def aadd_to_memory(self, type: str, content: dict):
         """
@@ -150,15 +152,7 @@ async def aadd_to_memory(self, type: str, content: dict):
             **content,
             "importance": await self.agrade_event_importance(type, content),
         }
-
-        new_entry = MemoryEntry(
-            agent=self.agent,
-            content={type: graded_content},
-            step=self.agent.model.steps,
-        )
-        self.memory_entries.append(new_entry)
-
-        super().add_to_memory(type, content)
+        self._finalize_entry(type, graded_content)
 
     def get_prompt_ready(self) -> str:
         return f"Top {self.considered_entries} memory entries:\n\n" + "\n".join(
diff --git a/tests/test_memory/test_episodic_memory.py b/tests/test_memory/test_episodic_memory.py
index f4fcd053..d7300b40 100644
--- a/tests/test_memory/test_episodic_memory.py
+++ b/tests/test_memory/test_episodic_memory.py
@@ -70,6 +70,24 @@ def test_add_memory_entry(self, mock_agent):
             "add_to_memory graded the event but never created a MemoryEntry"
         )
 
+    def test_finalize_entry_consistency(self, mock_agent):
+        """Minimal tests for the helper function _finalize_event_entry().
+        - This test ensures that:
+        - A `MemoryEntry` object is created and stored in episodic memory.
+        - The graded content is forwarded to the base `Memory.step_content` via `super().add_to_memory()` (regression guard).
+        - The stored entry contains the correct importance score.
+        - The entry is stamped with the current agent step.
+        """
+        memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
+        graded_content = {"data": "test", "importance": 4}
+
+        memory._finalize_entry("observation", graded_content)
+
+        assert memory.memory_entries[0].content["observation"]["importance"] == 4
+        assert memory.step_content["observation"]["importance"] == 4
+        assert isinstance(memory.memory_entries[0], MemoryEntry)
+        assert memory.memory_entries[0].step == mock_agent.model.steps
+
     def test_grade_event_importance(self, mock_agent):
         """Test grading event importance"""
         memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
@@ -206,6 +224,11 @@ async def test_async_add_memory_entry(self, mock_agent):
         await memory.aadd_to_memory("planning", {"plan": "Test plan"})
         await memory.aadd_to_memory("action", {"action": "Test action"})
 
+        new_entry = memory.memory_entries[0]
+
+        for new_entry in memory.memory_entries:
+            event_type = next(iter(new_entry.content.keys()))
+            assert new_entry.content[event_type]["importance"] == 3
         # checks to ensure that step content is not empty
         assert memory.step_content != {}
         assert len(memory.memory_entries) == 3, (

From 8451485e423bff7e451c5b3d954c4151c8abf6d1 Mon Sep 17 00:00:00 2001
From: Suraj Panickar <panickarsuraj.1@gmail.com>
Date: Wed, 4 Mar 2026 20:55:46 +0530
Subject: [PATCH 3/6] Improved grading logic (recency + importance) and added tests.

---
 mesa_llm/memory/episodic_memory.py        | 81 +++++++++++++++++++++--
 tests/test_memory/test_episodic_memory.py | 81 ++++++++++++++---------
 2 files changed, 125 insertions(+), 37 deletions(-)

diff --git a/mesa_llm/memory/episodic_memory.py b/mesa_llm/memory/episodic_memory.py
index 1c2dc7db..b653c11c 100644
--- a/mesa_llm/memory/episodic_memory.py
+++ b/mesa_llm/memory/episodic_memory.py
@@ -14,6 +14,32 @@ class EventGrade(BaseModel):
     grade: int
 
 
+def normalize_dict_values(scores: dict, min_target: float, max_target: float) -> dict:
+    """
+    normalise the values in the given using min-max scaling to the given range.
+    """
+    if not scores:
+        return {}
+
+    vals = list(scores.values())
+    min_val = min(vals)
+    max_val = max(vals)
+
+    range_val = max_val - min_val
+
+    if range_val == 0:
+        midpoint = (max_target - min_target) / 2 + min_target
+        for key in scores:
+            scores[key] = midpoint
+    else:
+        for key, val in scores.items():
+            scores[key] = (val - min_val) * (
+                max_target - min_target
+            ) / range_val + min_target
+
+    return scores
+
+
 class EpisodicMemory(Memory):
     """
     Stores memories based on event importance scoring. Each new memory entry is evaluated by a LLM
@@ -29,6 +55,7 @@ def __init__(
         display: bool = True,
         max_capacity: int = 10,
         considered_entries: int = 5,
+        recency_decay: float = 0.995,
     ):
         """
         Initialize the EpisodicMemory
@@ -43,6 +70,7 @@ def __init__(
         self.max_capacity = max_capacity
         self.memory_entries = deque(maxlen=self.max_capacity)
         self.considered_entries = considered_entries
+        self.recency_decay = recency_decay
 
         self.system_prompt = """
             You are an assistant that evaluates memory entries on a scale from 1 to 5, based on their importance to a specific problem or task. Your goal is to assign a score that reflects how much each entry contributes to understanding, solving, or advancing the task. Use the following grading scale:
@@ -60,6 +88,24 @@ def __init__(
             Only assess based on the entry's content and its value to the task at hand. Ignore style, grammar, or tone.
             """
 
+    def _extract_importance(self, entry) -> int:
+        """
+        Safely extracts importance score regardless of data structure.
+        Handles:
+        - Nested: {"msg": {"importance": 5}}
+        - Flat:   {"importance": 5}
+        """
+        if "importance" in entry.content:
+            val = entry.content["importance"]
+            return val if isinstance(val, (int, float)) else 1
+
+        for value in entry.content.values():
+            if isinstance(value, dict) and "importance" in value:
+                val = value["importance"]
+                return val if isinstance(val, (int, float)) else 1
+
+        return 1
+
     def _build_grade_prompt(self, type: str, content: dict) -> str:
         """
         This helper assembles a prompt that includes the event type, event content,
@@ -114,15 +160,36 @@ async def agrade_event_importance(self, type: str, content: dict) -> float:
 
     def retrieve_top_k_entries(self, k: int) -> list[MemoryEntry]:
         """
-        Retrieve the top k entries based on the importance and recency
+        Retrieve the top k entries based on the importance and recency (Releveance is yet to be added.)
+            - Uses min-max normlaizations to convert both importance and recency inorder to avoid large diffs in values.
+            - Computes total score by adding the normalised importance and recency values.
+            - Returns the list of entries in the final_score list
         """
-        top_list = sorted(
-            self.memory_entries,
-            key=lambda x: x.content["importance"] - (self.agent.model.steps - x.step),
-            reverse=True,
-        )
+        if not self.memory_entries:
+            return []
+
+        importance_dict = {}
+        recency_dict = {}
+
+        entries = list(self.memory_entries)
+        current_step = self.agent.model.steps
+
+        for i, entry in enumerate(entries):
+            importance_dict[i] = self._extract_importance(entry)
+
+            age = current_step - entry.step
+            recency_dict[i] = self.recency_decay**age
+
+        importance_scaled = normalize_dict_values(importance_dict, 0, 1)
+        recency_scaled = normalize_dict_values(recency_dict, 0, 1)
+
+        final_scores = []
+        for i in range(len(entries)):
+            total_score = importance_scaled[i] + recency_scaled[i]
+            final_scores.append((total_score, entries[i]))
 
-        return top_list[:k]
+        final_scores.sort(key=lambda x: x[0], reverse=True)
+        return [entry for _, entry in final_scores[:k]]
 
     def _finalize_entry(self, type: str, graded_content: dict):
         """Shared function for both sync/async add to memory which helps to create memory entry and stores it into a base class."""
diff --git a/tests/test_memory/test_episodic_memory.py b/tests/test_memory/test_episodic_memory.py
index d7300b40..61a988b9 100644
--- a/tests/test_memory/test_episodic_memory.py
+++ b/tests/test_memory/test_episodic_memory.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from mesa_llm.memory.episodic_memory import EpisodicMemory
+from mesa_llm.memory.episodic_memory import EpisodicMemory, normalize_dict_values
 from mesa_llm.memory.memory import MemoryEntry
 
 
@@ -26,6 +26,25 @@ def mock_agent():
     return agent
 
 
+def test_normalize_dict_floats_logic():
+    """
+    Function to check whether the values are normalised properly.
+        - Hardcoded dict values are used currently to ensure that the normalization logic works.
+        - Checks both cases, ie when the range = 0 and when its not 0.
+    """
+    d = {0: 10, 1: 20, 2: 30}
+    norm = normalize_dict_values(d, 0, 1)
+    assert norm[0] == 0.0
+    assert norm[1] == 0.5
+    assert norm[2] == 1.0
+
+    # Checks normalized value when range is 0
+    d_tie = {0: 5, 1: 5}
+    norm_tie = normalize_dict_values(d_tie, 0, 1)
+    assert norm_tie[0] == 0.5
+    assert norm_tie[1] == 0.5
+
+
 class TestEpisodicMemory:
     """Core functionality test"""
 
@@ -109,45 +128,37 @@ def test_grade_event_importance(self, mock_agent):
         # Check that the system prompt was set on the llm object
         assert memory.llm.system_prompt == memory.system_prompt
 
-    def test_retrieve_top_k_entries(self, mock_agent):
-        """Test the sorting logic for retrieving entries (importance - recency_penalty)."""
+    def test_retrieve_top_k_importance_beats_recency(self, mock_agent):
+        """
+        Function Verify that a highly important but older memory can outrank
+        a recent but low-importance memory after normalization.
+        """
+
         memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
-        # Set current step
-        mock_agent.model.steps = 100
 
-        # Manually add entries to bypass grading and control scores
-        # score = importance - (current_step - entry_step)
+        mock_agent.model.steps = 100
 
-        # score = 5 - (100 - 98) = 3
+        # Very important but old
         entry_a = MemoryEntry(
-            content={"importance": 5, "id": "A"}, step=98, agent=mock_agent
-        )
-        # score = 1 - (100 - 99) = 0
-        entry_b = MemoryEntry(
-            content={"importance": 1, "id": "B"}, step=99, agent=mock_agent
+            content={"message": {"importance": 5, "info": "The meaning of life"}},
+            step=80,
+            agent=mock_agent,
         )
-        # score = 4 - (100 - 90) = -6
+
+        # Very recent but unimportant
         entry_c = MemoryEntry(
-            content={"importance": 4, "id": "C"}, step=90, agent=mock_agent
-        )
-        # score = 4 - (100 - 95) = -1
-        entry_d = MemoryEntry(
-            content={"importance": 4, "id": "D"}, step=95, agent=mock_agent
+            content={"message": {"importance": 1, "info": "I saw a bird"}},
+            step=99,
+            agent=mock_agent,
         )
 
-        memory.memory_entries.extend([entry_a, entry_b, entry_c, entry_d])
+        memory.memory_entries.extend([entry_a, entry_c])
 
-        # Retrieve top 3 (k=3)
-        top_entries = memory.retrieve_top_k_entries(3)
+        top_entries = memory.retrieve_top_k_entries(1)
 
-        # Expected order: A (3), B (0), D (-1)
-        assert len(top_entries) == 3
-        assert top_entries[0].content["id"] == "A"
-        assert top_entries[1].content["id"] == "B"
-        assert top_entries[2].content["id"] == "D"
-
-        # Entry C (score -6) should be omitted
-        assert "C" not in [e.content["id"] for e in top_entries]
+        # The highly important memory should win
+        assert len(top_entries) == 1
+        assert top_entries[0] == entry_a
 
     def test_process_step_pre_step(self, mock_agent):
         """
@@ -290,3 +301,13 @@ def test_get_communication_history(self, mock_agent):
         assert (
             "No message here" not in history
         )  # step 2  does not have message field thus it must not be present in the returned string
+
+    def test_retrieve_empty_memory(self, mock_agent):
+        """
+        Function to verify empty list is returned when retrieval of memory is empty
+        """
+        memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
+
+        result = memory.retrieve_top_k_entries(3)
+
+        assert result == []

From 35995ec6ad713c5406faf7c35ea6b061b0003bd7 Mon Sep 17 00:00:00 2001
From: Suraj Panickar <panickarsuraj.1@gmail.com>
Date: Thu, 5 Mar 2026 01:20:58 +0530
Subject: [PATCH 4/6] Improved codecov coverage.

---
 tests/test_memory/test_episodic_memory.py | 44 +++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/tests/test_memory/test_episodic_memory.py b/tests/test_memory/test_episodic_memory.py
index 61a988b9..29237ca6 100644
--- a/tests/test_memory/test_episodic_memory.py
+++ b/tests/test_memory/test_episodic_memory.py
@@ -45,6 +45,14 @@ def test_normalize_dict_floats_logic():
     assert norm_tie[1] == 0.5
 
 
+def test_normalize_dict_floats_logic_when_empty():
+    """
+    Function to check whether normalize_dict_values correctly returns an empty dict
+    """
+    norm = normalize_dict_values({}, 0, 1)
+    assert norm == {}
+
+
 class TestEpisodicMemory:
     """Core functionality test"""
 
@@ -311,3 +319,39 @@ def test_retrieve_empty_memory(self, mock_agent):
         result = memory.retrieve_top_k_entries(3)
 
         assert result == []
+
+    def test_extract_importance_flat(self, mock_agent):
+        """Function to return importance when stored at top level"""
+        memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
+        entry = MemoryEntry(
+            content={"importance": 5, "message": "hello"},
+            step=1,
+            agent=mock_agent,
+        )
+        result = memory._extract_importance(entry)
+        assert result == 5
+
+    def test_extract_importance_nested(self, mock_agent):
+        """Should return importance when nested inside another dict"""
+        memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
+
+        entry = MemoryEntry(
+            content={"message": {"importance": 4, "text": "nested"}},
+            step=1,
+            agent=mock_agent,
+        )
+        result = memory._extract_importance(entry)
+
+        assert result == 4
+
+    def test_extract_importance_missing(self, mock_agent):
+        """Should fallback to 1 when importance is absent"""
+        memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
+
+        entry = MemoryEntry(
+            content={"message": {"text": "no importance"}},
+            step=1,
+            agent=mock_agent,
+        )
+        result = memory._extract_importance(entry)
+        assert result == 1

From ac90308409ccbf127a6ed87ddf84dde31772dd69 Mon Sep 17 00:00:00 2001
From: Wang Boyu <boyu.wby@gmail.com>
Date: Thu, 5 Mar 2026 15:18:55 -0500
Subject: [PATCH 5/6] skip pre/post step processing of memory entries in
 episodic memory

---
 mesa_llm/memory/episodic_memory.py        | 62 +++++++++------
 tests/test_memory/test_episodic_memory.py | 92 +++++++++++++++--------
 2 files changed, 97 insertions(+), 57 deletions(-)

diff --git a/mesa_llm/memory/episodic_memory.py b/mesa_llm/memory/episodic_memory.py
index b653c11c..629c0ba7 100644
--- a/mesa_llm/memory/episodic_memory.py
+++ b/mesa_llm/memory/episodic_memory.py
@@ -16,7 +16,11 @@ class EventGrade(BaseModel):
 
 def normalize_dict_values(scores: dict, min_target: float, max_target: float) -> dict:
     """
-    normalise the values in the given using min-max scaling to the given range.
+    Normalize dictionary values to a target range with min-max scaling.
+
+    This mirrors the min-max helper used in the Generative Agents reference
+    retrieval implementation:
+    https://github.com/joonspk-research/generative_agents/blob/main/reverie/backend_server/persona/cognitive_modules/retrieve.py
     """
     if not scores:
         return {}
@@ -42,10 +46,18 @@ def normalize_dict_values(scores: dict, min_target: float, max_target: float) ->
 
 class EpisodicMemory(Memory):
     """
-    Stores memories based on event importance scoring. Each new memory entry is evaluated by a LLM
-    for its relevance and importance (1-5 scale) relative to the agent's current task and previous
-    experiences. Based on a Stanford/DeepMind paper:
-    [Generative Agents: Interactive Simulacra of Human Behavior](https://arxiv.org/pdf/2304.03442)
+    Event-level memory with LLM-based importance scoring and recency-aware retrieval.
+
+    Credit / references:
+    - Paper: Generative Agents: Interactive Simulacra of Human Behavior
+      https://arxiv.org/abs/2304.03442
+    - Reference retrieval code:
+      https://github.com/joonspk-research/generative_agents/blob/main/reverie/backend_server/persona/cognitive_modules/retrieve.py
+
+    This implementation is inspired by the paper's retrieval scoring design
+    (component-wise min-max normalization, then weighted combination). It is
+    not a strict copy of the original code: relevance scoring via embeddings is
+    not implemented yet, and recency is computed from step age.
     """
 
     def __init__(
@@ -53,8 +65,8 @@ def __init__(
         agent: "LLMAgent",
         llm_model: str | None = None,
         display: bool = True,
-        max_capacity: int = 10,
-        considered_entries: int = 5,
+        max_capacity: int = 200,
+        considered_entries: int = 30,
         recency_decay: float = 0.995,
     ):
         """
@@ -160,10 +172,13 @@ async def agrade_event_importance(self, type: str, content: dict) -> float:
 
     def retrieve_top_k_entries(self, k: int) -> list[MemoryEntry]:
         """
-        Retrieve the top k entries based on the importance and recency (Releveance is yet to be added.)
-            - Uses min-max normlaizations to convert both importance and recency inorder to avoid large diffs in values.
-            - Computes total score by adding the normalised importance and recency values.
-            - Returns the list of entries in the final_score list
+        Retrieve the top-k entries using normalized importance and recency.
+
+        Notes:
+        - Inspired by Generative Agents retrieval scoring:
+          recency/importance/relevance are normalized separately and combined.
+        - This implementation currently combines importance + recency only.
+          Relevance (embedding cosine similarity with a focal query) is pending.
         """
         if not self.memory_entries:
             return []
@@ -192,14 +207,13 @@ def retrieve_top_k_entries(self, k: int) -> list[MemoryEntry]:
         return [entry for _, entry in final_scores[:k]]
 
     def _finalize_entry(self, type: str, graded_content: dict):
-        """Shared function for both sync/async add to memory which helps to create memory entry and stores it into a base class."""
+        """Create and persist a finalized episodic entry."""
         new_entry = MemoryEntry(
             agent=self.agent,
             content={type: graded_content},
             step=self.agent.model.steps,
         )
         self.memory_entries.append(new_entry)
-        super().add_to_memory(type, graded_content)
 
     def add_to_memory(self, type: str, content: dict):
         """
@@ -243,20 +257,18 @@ def get_communication_history(self) -> str:
 
     async def aprocess_step(self, pre_step: bool = False):
         """
-        Asynchronous version of process_step
+        Asynchronous version of process_step.
+
+        EpisodicMemory persists entries at add-time and does not use two-phase
+        pre/post-step buffering.
         """
-        if pre_step:
-            await self.aadd_to_memory(type="observation", content=self.step_content)
-            self.step_content = {}
-            return
+        return
 
     def process_step(self, pre_step: bool = False):
         """
-        Process the step of the agent :
-        - Add the new entry to the memory
-        - Display the new entry
+        Process step hook (no-op for episodic memory).
+
+        EpisodicMemory persists entries at add-time and does not use two-phase
+        pre/post-step buffering.
         """
-        if pre_step:
-            self.add_to_memory(type="observation", content=self.step_content)
-            self.step_content = {}
-            return
+        return
diff --git a/tests/test_memory/test_episodic_memory.py b/tests/test_memory/test_episodic_memory.py
index 29237ca6..ad841fee 100644
--- a/tests/test_memory/test_episodic_memory.py
+++ b/tests/test_memory/test_episodic_memory.py
@@ -91,17 +91,16 @@ def test_add_memory_entry(self, mock_agent):
         # Test with action
         memory.add_to_memory("action", {"action": "Test action"})
 
-        # Should be empty step_content initially
-        assert memory.step_content != {}
+        # EpisodicMemory should not rely on transient step buffers.
+        assert memory.step_content == {}
         assert len(memory.memory_entries) == 3, (
             "add_to_memory graded the event but never created a MemoryEntry"
         )
 
     def test_finalize_entry_consistency(self, mock_agent):
-        """Minimal tests for the helper function _finalize_event_entry().
+        """Minimal tests for the helper function _finalize_entry().
         - This test ensures that:
         - A `MemoryEntry` object is created and stored in episodic memory.
-        - The graded content is forwarded to the base `Memory.step_content` via `super().add_to_memory()` (regression guard).
         - The stored entry contains the correct importance score.
         - The entry is stamped with the current agent step.
         """
@@ -111,7 +110,7 @@ def test_finalize_entry_consistency(self, mock_agent):
         memory._finalize_entry("observation", graded_content)
 
         assert memory.memory_entries[0].content["observation"]["importance"] == 4
-        assert memory.step_content["observation"]["importance"] == 4
+        assert memory.step_content == {}
         assert isinstance(memory.memory_entries[0], MemoryEntry)
         assert memory.memory_entries[0].step == mock_agent.model.steps
 
@@ -170,38 +169,48 @@ def test_retrieve_top_k_importance_beats_recency(self, mock_agent):
 
     def test_process_step_pre_step(self, mock_agent):
         """
-        The process_step function in the episodic_memory when called with 'pre_step=True' takes whatever is already inside the step_content,
-        then calls the add_to_memory function and then clears the step_content.
-
-        This test function performs the following 2 tests,
-            - Checks whether the add_to_memory function is called correctly when 'pre_step=True.'
-            - Also performs a final check to ensure the step_content is cleared.
+        EpisodicMemory process_step is a no-op: it should not call add_to_memory
+        and should not mutate transient buffers.
         """
         memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
 
         # Pre-populate step_content
         memory.step_content = {"observation": {"data": "test"}}
 
-        # Spy on add_to_memory and call the process step with param as True
+        # Spy on add_to_memory and call process_step with pre_step=True
         memory.add_to_memory = MagicMock()
         memory.process_step(pre_step=True)
 
-        # Checks if add_to_memory was called once
-        memory.add_to_memory.assert_called_once_with(
-            type="observation",
-            content={"observation": {"data": "test"}},
-        )
+        memory.add_to_memory.assert_not_called()
 
-        # checks whether the step_content is cleared at the end
-        assert memory.step_content == {}
+        # no-op should keep buffer unchanged
+        assert memory.step_content == {"observation": {"data": "test"}}
+
+    def test_process_step_pre_step_does_not_append_entries(self, mock_agent):
+        """
+        Regression test: process_step/pre_step must not append synthetic entries.
+        """
+        memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
+
+        mock_response = MagicMock()
+        mock_response.choices[0].message.content = json.dumps({"grade": 3})
+        memory.llm.generate = MagicMock(return_value=mock_response)
+
+        memory.add_to_memory("action", {"action": "real event"})
+        existing_entries = list(memory.memory_entries)
+        assert len(existing_entries) == 1
+
+        memory.step_content = {"observation": {"data": "buffer-only"}}
+        memory.process_step(pre_step=True)
+
+        assert list(memory.memory_entries) == existing_entries
+        assert len(memory.memory_entries) == 1
+        assert memory.step_content == {"observation": {"data": "buffer-only"}}
 
     @pytest.mark.asyncio
     async def test_aprocess_step_pre_step(self, mock_agent):
         """
-        Asynchronous version of the 'test_process_step_pre_step'
-        Implements the same checks as the sync counterpart function but in an async manner.
-            - checks whether aadd_to_memory function was called correctly
-            - checks whether the step_content was cleared correctly at the end
+        Async process_step is also a no-op for episodic memory.
         """
         memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
 
@@ -210,13 +219,33 @@ async def test_aprocess_step_pre_step(self, mock_agent):
 
         await memory.aprocess_step(pre_step=True)
 
-        #
-        memory.aadd_to_memory.assert_awaited_once_with(
-            type="observation",
-            content={"observation": {"data": "test"}},
-        )
+        memory.aadd_to_memory.assert_not_awaited()
 
-        assert memory.step_content == {}
+        assert memory.step_content == {"observation": {"data": "test"}}
+
+    @pytest.mark.asyncio
+    async def test_aprocess_step_pre_step_does_not_append_entries(self, mock_agent):
+        """
+        Async regression test: aprocess_step must not append synthetic entries.
+        """
+        memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
+
+        mock_response = MagicMock()
+        mock_response.choices = [
+            MagicMock(message=MagicMock(content=json.dumps({"grade": 3})))
+        ]
+        memory.llm.agenerate = AsyncMock(return_value=mock_response)
+
+        await memory.aadd_to_memory("action", {"action": "real event"})
+        existing_entries = list(memory.memory_entries)
+        assert len(existing_entries) == 1
+
+        memory.step_content = {"observation": {"data": "buffer-only"}}
+        await memory.aprocess_step(pre_step=True)
+
+        assert list(memory.memory_entries) == existing_entries
+        assert len(memory.memory_entries) == 1
+        assert memory.step_content == {"observation": {"data": "buffer-only"}}
 
     @pytest.mark.asyncio
     async def test_async_add_memory_entry(self, mock_agent):
@@ -226,7 +255,7 @@ async def test_async_add_memory_entry(self, mock_agent):
         The test function does the following
             - mocks the llm to produece a pre-determined grading.
             - then calls the aad_to_memory function
-            - checks to ensure that the step_content is not empty as the aadd_to_memory function will have added entries into it.
+            - checks that entries are persisted directly into memory_entries.
         """
         memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model")
 
@@ -248,8 +277,7 @@ async def test_async_add_memory_entry(self, mock_agent):
         for new_entry in memory.memory_entries:
             event_type = next(iter(new_entry.content.keys()))
             assert new_entry.content[event_type]["importance"] == 3
-        # checks to ensure that step content is not empty
-        assert memory.step_content != {}
+        assert memory.step_content == {}
         assert len(memory.memory_entries) == 3, (
             "aadd_to_memory graded the event but never created a MemoryEntry"
         )

From 962f2774f08927cf0c7074d4327784790c2de777 Mon Sep 17 00:00:00 2001
From: Wang Boyu <boyu.wby@gmail.com>
Date: Thu, 5 Mar 2026 15:38:33 -0500
Subject: [PATCH 6/6] fix integration test for episodic memory

---
 .../test_integration/test_memory_reasoning.py | 40 +++++++++++--------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/tests/test_integration/test_memory_reasoning.py b/tests/test_integration/test_memory_reasoning.py
index f868e3fb..ae31c6f3 100644
--- a/tests/test_integration/test_memory_reasoning.py
+++ b/tests/test_integration/test_memory_reasoning.py
@@ -280,12 +280,14 @@ def test_plan_records_to_memory(self, monkeypatch):
         plan = reasoning.plan(obs=obs)
 
         assert isinstance(plan, Plan)
-        assert memory.step_content["Observation"]["content"] == str(obs)
-        assert memory.step_content["Plan"]["content"] == plan_content
-        assert memory.step_content["Plan-Execution"]["content"] == str(plan)
-        assert memory.step_content["Observation"]["importance"] == 3
-        assert memory.step_content["Plan"]["importance"] == 3
-        assert memory.step_content["Plan-Execution"]["importance"] == 3
+        entries = list(memory.memory_entries)
+        assert len(entries) == 3
+        assert entries[0].content["Observation"]["content"] == str(obs)
+        assert entries[1].content["Plan"]["content"] == plan_content
+        assert entries[2].content["Plan-Execution"]["content"] == str(plan)
+        assert entries[0].content["Observation"]["importance"] == 3
+        assert entries[1].content["Plan"]["importance"] == 3
+        assert entries[2].content["Plan-Execution"]["importance"] == 3
         assert memory.grade_event_importance.call_count == 3
 
     def test_async_plan_works(self, monkeypatch):
@@ -301,12 +303,14 @@ def test_async_plan_works(self, monkeypatch):
         plan = asyncio.run(reasoning.aplan(obs=obs))
 
         assert isinstance(plan, Plan)
-        assert memory.step_content["Observation"]["content"] == str(obs)
-        assert memory.step_content["Plan"]["content"] == plan_content
-        assert memory.step_content["Plan-Execution"]["content"] == str(plan)
-        assert memory.step_content["Observation"]["importance"] == 3
-        assert memory.step_content["Plan"]["importance"] == 3
-        assert memory.step_content["Plan-Execution"]["importance"] == 3
+        entries = list(memory.memory_entries)
+        assert len(entries) == 3
+        assert entries[0].content["Observation"]["content"] == str(obs)
+        assert entries[1].content["Plan"]["content"] == plan_content
+        assert entries[2].content["Plan-Execution"]["content"] == str(plan)
+        assert entries[0].content["Observation"]["importance"] == 3
+        assert entries[1].content["Plan"]["importance"] == 3
+        assert entries[2].content["Plan-Execution"]["importance"] == 3
         assert memory.agrade_event_importance.await_count == 3
 
 
@@ -677,8 +681,10 @@ def test_plan_records_to_memory(self, monkeypatch):
 
         plan = reasoning.plan()
         assert isinstance(plan, Plan)
-        assert memory.step_content["plan"]["content"] == plan_content
-        assert memory.step_content["plan"]["importance"] == 3
+        entries = list(memory.memory_entries)
+        assert len(entries) == 1
+        assert entries[0].content["plan"]["content"] == plan_content
+        assert entries[0].content["plan"]["importance"] == 3
         assert memory.grade_event_importance.call_count == 1
         reasoning.execute_tool_call.assert_called_once_with(
             plan_content, selected_tools=None, ttl=1
@@ -699,8 +705,10 @@ def test_async_plan_works(self, monkeypatch):
 
         plan = asyncio.run(reasoning.aplan())
         assert isinstance(plan, Plan)
-        assert memory.step_content["plan"]["content"] == plan_content
-        assert memory.step_content["plan"]["importance"] == 3
+        entries = list(memory.memory_entries)
+        assert len(entries) == 1
+        assert entries[0].content["plan"]["content"] == plan_content
+        assert entries[0].content["plan"]["importance"] == 3
         assert memory.grade_event_importance.call_count == 1
         reasoning.aexecute_tool_call.assert_awaited_once_with(
             plan_content, selected_tools=None, ttl=1