From 421d4c345c4dd0e3a48c3019e95b820bd9d9f074 Mon Sep 17 00:00:00 2001 From: Suraj Panickar Date: Sun, 1 Mar 2026 11:10:46 +0530 Subject: [PATCH 1/6] fix: store graded EpisodicMemory entries as MemoryEntry objects and use correct LLM instance --- mesa_llm/memory/episodic_memory.py | 33 ++++++++++++++++++----- tests/test_memory/test_episodic_memory.py | 16 ++++++++--- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/mesa_llm/memory/episodic_memory.py b/mesa_llm/memory/episodic_memory.py index d473cc6b..a635a5f3 100644 --- a/mesa_llm/memory/episodic_memory.py +++ b/mesa_llm/memory/episodic_memory.py @@ -89,7 +89,7 @@ def grade_event_importance(self, type: str, content: dict) -> float: prompt = self._build_grade_prompt(type, content) self.llm.system_prompt = self.system_prompt - rsp = self.agent.llm.generate( + rsp = self.llm.generate( prompt=prompt, response_format=EventGrade, ) @@ -104,7 +104,7 @@ async def agrade_event_importance(self, type: str, content: dict) -> float: prompt = self._build_grade_prompt(type, content) self.llm.system_prompt = self.system_prompt - rsp = await self.agent.llm.agenerate( + rsp = await self.llm.agenerate( prompt=prompt, response_format=EventGrade, ) @@ -126,17 +126,38 @@ def retrieve_top_k_entries(self, k: int) -> list[MemoryEntry]: def add_to_memory(self, type: str, content: dict): """ - Add a new memory entry to the memory + grading logic + Add a new memory entry to the memory """ - content["importance"] = self.grade_event_importance(type, content) + graded_content = { + **content, + "importance": self.grade_event_importance(type, content), + } + + new_entry = MemoryEntry( + agent=self.agent, + content={type: graded_content}, + step=self.agent.model.steps, + ) + self.memory_entries.append(new_entry) super().add_to_memory(type, content) async def aadd_to_memory(self, type: str, content: dict): """ - Async version of add_to_memory + Async version of add_to_memory + grading logic """ - content["importance"] = await self.agrade_event_importance(type, content) + graded_content = { + **content, + "importance": await self.agrade_event_importance(type, content), + } + + new_entry = MemoryEntry( + agent=self.agent, + content={type: graded_content}, + step=self.agent.model.steps, + ) + self.memory_entries.append(new_entry) + super().add_to_memory(type, content) def get_prompt_ready(self) -> str: diff --git a/tests/test_memory/test_episodic_memory.py b/tests/test_memory/test_episodic_memory.py index c8f91f69..f4fcd053 100644 --- a/tests/test_memory/test_episodic_memory.py +++ b/tests/test_memory/test_episodic_memory.py @@ -51,6 +51,10 @@ def test_add_memory_entry(self, mock_agent): """Test adding memories to Episodic memory""" memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") + mock_response = MagicMock() + mock_response.choices[0].message.content = json.dumps({"grade": 3}) + memory.llm.generate = MagicMock(return_value=mock_response) + # Test basic addition with observation memory.add_to_memory("observation", {"step": 1, "content": "Test content"}) @@ -62,6 +66,9 @@ def test_add_memory_entry(self, mock_agent): # Should be empty step_content initially assert memory.step_content != {} + assert len(memory.memory_entries) == 3, ( + "add_to_memory graded the event but never created a MemoryEntry" + ) def test_grade_event_importance(self, mock_agent): """Test grading event importance""" @@ -70,7 +77,7 @@ def test_grade_event_importance(self, mock_agent): # 1. Set up a specific grade for this test mock_response = MagicMock() mock_response.choices[0].message.content = json.dumps({"grade": 5}) - mock_agent.llm.generate.return_value = mock_response + memory.llm.generate = MagicMock(return_value=mock_response) # 2. Call the method grade = memory.grade_event_importance("observation", {"data": "critical info"}) @@ -79,7 +86,7 @@ def test_grade_event_importance(self, mock_agent): assert grade == 5 # 4. Assert the LLM was called correctly - mock_agent.llm.generate.assert_called_once() + memory.llm.generate.assert_called_once() # Check that the system prompt was set on the llm object assert memory.llm.system_prompt == memory.system_prompt @@ -192,7 +199,7 @@ async def test_async_add_memory_entry(self, mock_agent): ] # Assigns the mock response - mock_agent.llm.agenerate = AsyncMock(return_value=mock_response) + memory.llm.agenerate = AsyncMock(return_value=mock_response) # adds content into the memory using the async counter part of add_to_memory function await memory.aadd_to_memory("observation", {"content": "Test content"}) @@ -201,6 +208,9 @@ async def test_async_add_memory_entry(self, mock_agent): # checks to ensure that step content is not empty assert memory.step_content != {} + assert len(memory.memory_entries) == 3, ( + "aadd_to_memory graded the event but never created a MemoryEntry" + ) def test_build_grade_prompt_no_previous_entries(self, mock_agent): """ From d3db689258bd82bb504fbfa2dcc19b9efc0cfb33 Mon Sep 17 00:00:00 2001 From: Suraj Panickar Date: Mon, 2 Mar 2026 22:04:25 +0530 Subject: [PATCH 2/6] Refactored sync/async add to memory functions and updated tests. --- mesa_llm/memory/episodic_memory.py | 32 +++++++++-------------- tests/test_memory/test_episodic_memory.py | 23 ++++++++++++++++ 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/mesa_llm/memory/episodic_memory.py b/mesa_llm/memory/episodic_memory.py index a635a5f3..1c2dc7db 100644 --- a/mesa_llm/memory/episodic_memory.py +++ b/mesa_llm/memory/episodic_memory.py @@ -124,23 +124,25 @@ def retrieve_top_k_entries(self, k: int) -> list[MemoryEntry]: return top_list[:k] - def add_to_memory(self, type: str, content: dict): - """ - grading logic + Add a new memory entry to the memory - """ - graded_content = { - **content, - "importance": self.grade_event_importance(type, content), - } - + def _finalize_entry(self, type: str, graded_content: dict): + """Shared function for both sync/async add to memory which helps to create memory entry and stores it into a base class.""" new_entry = MemoryEntry( agent=self.agent, content={type: graded_content}, step=self.agent.model.steps, ) self.memory_entries.append(new_entry) + super().add_to_memory(type, graded_content) - super().add_to_memory(type, content) + def add_to_memory(self, type: str, content: dict): + """ + grading logic + adding to memory function call + """ + graded_content = { + **content, + "importance": self.grade_event_importance(type, content), + } + self._finalize_entry(type, graded_content) async def aadd_to_memory(self, type: str, content: dict): """ @@ -150,15 +152,7 @@ async def aadd_to_memory(self, type: str, content: dict): **content, "importance": await self.agrade_event_importance(type, content), } - - new_entry = MemoryEntry( - agent=self.agent, - content={type: graded_content}, - step=self.agent.model.steps, - ) - self.memory_entries.append(new_entry) - - super().add_to_memory(type, content) + self._finalize_entry(type, graded_content) def get_prompt_ready(self) -> str: return f"Top {self.considered_entries} memory entries:\n\n" + "\n".join( diff --git a/tests/test_memory/test_episodic_memory.py b/tests/test_memory/test_episodic_memory.py index f4fcd053..d7300b40 100644 --- a/tests/test_memory/test_episodic_memory.py +++ b/tests/test_memory/test_episodic_memory.py @@ -70,6 +70,24 @@ def test_add_memory_entry(self, mock_agent): "add_to_memory graded the event but never created a MemoryEntry" ) + def test_finalize_entry_consistency(self, mock_agent): + """Minimal tests for the helper function _finalize_event_entry(). + - This test ensures that: + - A `MemoryEntry` object is created and stored in episodic memory. + - The graded content is forwarded to the base `Memory.step_content` via `super().add_to_memory()` (regression guard). + - The stored entry contains the correct importance score. + - The entry is stamped with the current agent step. + """ + memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") + graded_content = {"data": "test", "importance": 4} + + memory._finalize_entry("observation", graded_content) + + assert memory.memory_entries[0].content["observation"]["importance"] == 4 + assert memory.step_content["observation"]["importance"] == 4 + assert isinstance(memory.memory_entries[0], MemoryEntry) + assert memory.memory_entries[0].step == mock_agent.model.steps + def test_grade_event_importance(self, mock_agent): """Test grading event importance""" memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") @@ -206,6 +224,11 @@ async def test_async_add_memory_entry(self, mock_agent): await memory.aadd_to_memory("planning", {"plan": "Test plan"}) await memory.aadd_to_memory("action", {"action": "Test action"}) + new_entry = memory.memory_entries[0] + + for new_entry in memory.memory_entries: + event_type = next(iter(new_entry.content.keys())) + assert new_entry.content[event_type]["importance"] == 3 # checks to ensure that step content is not empty assert memory.step_content != {} assert len(memory.memory_entries) == 3, ( From 8451485e423bff7e451c5b3d954c4151c8abf6d1 Mon Sep 17 00:00:00 2001 From: Suraj Panickar Date: Wed, 4 Mar 2026 20:55:46 +0530 Subject: [PATCH 3/6] Improved grading logic(recency + importance) added tests. --- mesa_llm/memory/episodic_memory.py | 81 +++++++++++++++++++++-- tests/test_memory/test_episodic_memory.py | 81 ++++++++++++++--------- 2 files changed, 125 insertions(+), 37 deletions(-) diff --git a/mesa_llm/memory/episodic_memory.py b/mesa_llm/memory/episodic_memory.py index 1c2dc7db..b653c11c 100644 --- a/mesa_llm/memory/episodic_memory.py +++ b/mesa_llm/memory/episodic_memory.py @@ -14,6 +14,32 @@ class EventGrade(BaseModel): grade: int +def normalize_dict_values(scores: dict, min_target: float, max_target: float) -> dict: + """ + normalise the values in the given using min-max scaling to the given range. + """ + if not scores: + return {} + + vals = list(scores.values()) + min_val = min(vals) + max_val = max(vals) + + range_val = max_val - min_val + + if range_val == 0: + midpoint = (max_target - min_target) / 2 + min_target + for key in scores: + scores[key] = midpoint + else: + for key, val in scores.items(): + scores[key] = (val - min_val) * ( + max_target - min_target + ) / range_val + min_target + + return scores + + class EpisodicMemory(Memory): """ Stores memories based on event importance scoring. Each new memory entry is evaluated by a LLM @@ -29,6 +55,7 @@ def __init__( display: bool = True, max_capacity: int = 10, considered_entries: int = 5, + recency_decay: float = 0.995, ): """ Initialize the EpisodicMemory @@ -43,6 +70,7 @@ def __init__( self.max_capacity = max_capacity self.memory_entries = deque(maxlen=self.max_capacity) self.considered_entries = considered_entries + self.recency_decay = recency_decay self.system_prompt = """ You are an assistant that evaluates memory entries on a scale from 1 to 5, based on their importance to a specific problem or task. Your goal is to assign a score that reflects how much each entry contributes to understanding, solving, or advancing the task. Use the following grading scale: @@ -60,6 +88,24 @@ def __init__( Only assess based on the entry's content and its value to the task at hand. Ignore style, grammar, or tone. """ + def _extract_importance(self, entry) -> int: + """ + Safely extracts importance score regardless of data structure. + Handles: + - Nested: {"msg": {"importance": 5}} + - Flat: {"importance": 5} + """ + if "importance" in entry.content: + val = entry.content["importance"] + return val if isinstance(val, (int, float)) else 1 + + for value in entry.content.values(): + if isinstance(value, dict) and "importance" in value: + val = value["importance"] + return val if isinstance(val, (int, float)) else 1 + + return 1 + def _build_grade_prompt(self, type: str, content: dict) -> str: """ This helper assembles a prompt that includes the event type, event content, @@ -114,15 +160,36 @@ async def agrade_event_importance(self, type: str, content: dict) -> float: def retrieve_top_k_entries(self, k: int) -> list[MemoryEntry]: """ - Retrieve the top k entries based on the importance and recency + Retrieve the top k entries based on the importance and recency (Releveance is yet to be added.) + - Uses min-max normlaizations to convert both importance and recency inorder to avoid large diffs in values. + - Computes total score by adding the normalised importance and recency values. + - Returns the list of entries in the final_score list """ - top_list = sorted( - self.memory_entries, - key=lambda x: x.content["importance"] - (self.agent.model.steps - x.step), - reverse=True, - ) + if not self.memory_entries: + return [] + + importance_dict = {} + recency_dict = {} + + entries = list(self.memory_entries) + current_step = self.agent.model.steps + + for i, entry in enumerate(entries): + importance_dict[i] = self._extract_importance(entry) + + age = current_step - entry.step + recency_dict[i] = self.recency_decay**age + + importance_scaled = normalize_dict_values(importance_dict, 0, 1) + recency_scaled = normalize_dict_values(recency_dict, 0, 1) + + final_scores = [] + for i in range(len(entries)): + total_score = importance_scaled[i] + recency_scaled[i] + final_scores.append((total_score, entries[i])) - return top_list[:k] + final_scores.sort(key=lambda x: x[0], reverse=True) + return [entry for _, entry in final_scores[:k]] def _finalize_entry(self, type: str, graded_content: dict): """Shared function for both sync/async add to memory which helps to create memory entry and stores it into a base class.""" diff --git a/tests/test_memory/test_episodic_memory.py b/tests/test_memory/test_episodic_memory.py index d7300b40..61a988b9 100644 --- a/tests/test_memory/test_episodic_memory.py +++ b/tests/test_memory/test_episodic_memory.py @@ -4,7 +4,7 @@ import pytest -from mesa_llm.memory.episodic_memory import EpisodicMemory +from mesa_llm.memory.episodic_memory import EpisodicMemory, normalize_dict_values from mesa_llm.memory.memory import MemoryEntry @@ -26,6 +26,25 @@ def mock_agent(): return agent +def test_normalize_dict_floats_logic(): + """ + Function to check whether the values are normalised properly. + - Hardcoded dict values are used currently to ensure that the normalization logic works. + - Checks both cases, ie when the range = 0 and when its not 0. + """ + d = {0: 10, 1: 20, 2: 30} + norm = normalize_dict_values(d, 0, 1) + assert norm[0] == 0.0 + assert norm[1] == 0.5 + assert norm[2] == 1.0 + + # Checks normalized value when range is 0 + d_tie = {0: 5, 1: 5} + norm_tie = normalize_dict_values(d_tie, 0, 1) + assert norm_tie[0] == 0.5 + assert norm_tie[1] == 0.5 + + class TestEpisodicMemory: """Core functionality test""" @@ -109,45 +128,37 @@ def test_grade_event_importance(self, mock_agent): # Check that the system prompt was set on the llm object assert memory.llm.system_prompt == memory.system_prompt - def test_retrieve_top_k_entries(self, mock_agent): - """Test the sorting logic for retrieving entries (importance - recency_penalty).""" + def test_retrieve_top_k_importance_beats_recency(self, mock_agent): + """ + Function Verify that a highly important but older memory can outrank + a recent but low-importance memory after normalization. + """ + memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") - # Set current step - mock_agent.model.steps = 100 - # Manually add entries to bypass grading and control scores - # score = importance - (current_step - entry_step) + mock_agent.model.steps = 100 - # score = 5 - (100 - 98) = 3 + # Very important but old entry_a = MemoryEntry( - content={"importance": 5, "id": "A"}, step=98, agent=mock_agent - ) - # score = 1 - (100 - 99) = 0 - entry_b = MemoryEntry( - content={"importance": 1, "id": "B"}, step=99, agent=mock_agent + content={"message": {"importance": 5, "info": "The meaning of life"}}, + step=80, + agent=mock_agent, ) - # score = 4 - (100 - 90) = -6 + + # Very recent but unimportant entry_c = MemoryEntry( - content={"importance": 4, "id": "C"}, step=90, agent=mock_agent - ) - # score = 4 - (100 - 95) = -1 - entry_d = MemoryEntry( - content={"importance": 4, "id": "D"}, step=95, agent=mock_agent + content={"message": {"importance": 1, "info": "I saw a bird"}}, + step=99, + agent=mock_agent, ) - memory.memory_entries.extend([entry_a, entry_b, entry_c, entry_d]) + memory.memory_entries.extend([entry_a, entry_c]) - # Retrieve top 3 (k=3) - top_entries = memory.retrieve_top_k_entries(3) + top_entries = memory.retrieve_top_k_entries(1) - # Expected order: A (3), B (0), D (-1) - assert len(top_entries) == 3 - assert top_entries[0].content["id"] == "A" - assert top_entries[1].content["id"] == "B" - assert top_entries[2].content["id"] == "D" - - # Entry C (score -6) should be omitted - assert "C" not in [e.content["id"] for e in top_entries] + # The highly important memory should win + assert len(top_entries) == 1 + assert top_entries[0] == entry_a def test_process_step_pre_step(self, mock_agent): """ @@ -290,3 +301,13 @@ def test_get_communication_history(self, mock_agent): assert ( "No message here" not in history ) # step 2 does not have message field thus it must not be present in the returned string + + def test_retrieve_empty_memory(self, mock_agent): + """ + Function to verify empty list is returned when retrieval of memory is empty + """ + memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") + + result = memory.retrieve_top_k_entries(3) + + assert result == [] From 35995ec6ad713c5406faf7c35ea6b061b0003bd7 Mon Sep 17 00:00:00 2001 From: Suraj Panickar Date: Thu, 5 Mar 2026 01:20:58 +0530 Subject: [PATCH 4/6] Imporved codecov coverage. --- tests/test_memory/test_episodic_memory.py | 44 +++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/test_memory/test_episodic_memory.py b/tests/test_memory/test_episodic_memory.py index 61a988b9..29237ca6 100644 --- a/tests/test_memory/test_episodic_memory.py +++ b/tests/test_memory/test_episodic_memory.py @@ -45,6 +45,14 @@ def test_normalize_dict_floats_logic(): assert norm_tie[1] == 0.5 +def test_normalize_dict_floats_logic_when_empty(): + """ + Function to check whether normalize_dict_values correctly returns an empty dict + """ + norm = normalize_dict_values({}, 0, 1) + assert norm == {} + + class TestEpisodicMemory: """Core functionality test""" @@ -311,3 +319,39 @@ def test_retrieve_empty_memory(self, mock_agent): result = memory.retrieve_top_k_entries(3) assert result == [] + + def test_extract_importance_flat(self, mock_agent): + """Function to return importance when stored at top level""" + memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") + entry = MemoryEntry( + content={"importance": 5, "message": "hello"}, + step=1, + agent=mock_agent, + ) + result = memory._extract_importance(entry) + assert result == 5 + + def test_extract_importance_nested(self, mock_agent): + """Should return importance when nested inside another dict""" + memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") + + entry = MemoryEntry( + content={"message": {"importance": 4, "text": "nested"}}, + step=1, + agent=mock_agent, + ) + result = memory._extract_importance(entry) + + assert result == 4 + + def test_extract_importance_missing(self, mock_agent): + """Should fallback to 1 when importance is absent""" + memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") + + entry = MemoryEntry( + content={"message": {"text": "no importance"}}, + step=1, + agent=mock_agent, + ) + result = memory._extract_importance(entry) + assert result == 1 From ac90308409ccbf127a6ed87ddf84dde31772dd69 Mon Sep 17 00:00:00 2001 From: Wang Boyu Date: Thu, 5 Mar 2026 15:18:55 -0500 Subject: [PATCH 5/6] skip pre/post step processing of memory entries in episodic memory --- mesa_llm/memory/episodic_memory.py | 62 +++++++++------ tests/test_memory/test_episodic_memory.py | 92 +++++++++++++++-------- 2 files changed, 97 insertions(+), 57 deletions(-) diff --git a/mesa_llm/memory/episodic_memory.py b/mesa_llm/memory/episodic_memory.py index b653c11c..629c0ba7 100644 --- a/mesa_llm/memory/episodic_memory.py +++ b/mesa_llm/memory/episodic_memory.py @@ -16,7 +16,11 @@ class EventGrade(BaseModel): def normalize_dict_values(scores: dict, min_target: float, max_target: float) -> dict: """ - normalise the values in the given using min-max scaling to the given range. + Normalize dictionary values to a target range with min-max scaling. + + This mirrors the min-max helper used in the Generative Agents reference + retrieval implementation: + https://github.com/joonspk-research/generative_agents/blob/main/reverie/backend_server/persona/cognitive_modules/retrieve.py """ if not scores: return {} @@ -42,10 +46,18 @@ def normalize_dict_values(scores: dict, min_target: float, max_target: float) -> class EpisodicMemory(Memory): """ - Stores memories based on event importance scoring. Each new memory entry is evaluated by a LLM - for its relevance and importance (1-5 scale) relative to the agent's current task and previous - experiences. Based on a Stanford/DeepMind paper: - [Generative Agents: Interactive Simulacra of Human Behavior](https://arxiv.org/pdf/2304.03442) + Event-level memory with LLM-based importance scoring and recency-aware retrieval. + + Credit / references: + - Paper: Generative Agents: Interactive Simulacra of Human Behavior + https://arxiv.org/abs/2304.03442 + - Reference retrieval code: + https://github.com/joonspk-research/generative_agents/blob/main/reverie/backend_server/persona/cognitive_modules/retrieve.py + + This implementation is inspired by the paper's retrieval scoring design + (component-wise min-max normalization, then weighted combination). It is + not a strict copy of the original code: relevance scoring via embeddings is + not implemented yet, and recency is computed from step age. """ def __init__( @@ -53,8 +65,8 @@ def __init__( agent: "LLMAgent", llm_model: str | None = None, display: bool = True, - max_capacity: int = 10, - considered_entries: int = 5, + max_capacity: int = 200, + considered_entries: int = 30, recency_decay: float = 0.995, ): """ @@ -160,10 +172,13 @@ async def agrade_event_importance(self, type: str, content: dict) -> float: def retrieve_top_k_entries(self, k: int) -> list[MemoryEntry]: """ - Retrieve the top k entries based on the importance and recency (Releveance is yet to be added.) - - Uses min-max normlaizations to convert both importance and recency inorder to avoid large diffs in values. - - Computes total score by adding the normalised importance and recency values. - - Returns the list of entries in the final_score list + Retrieve the top-k entries using normalized importance and recency. + + Notes: + - Inspired by Generative Agents retrieval scoring: + recency/importance/relevance are normalized separately and combined. + - This implementation currently combines importance + recency only. + Relevance (embedding cosine similarity with a focal query) is pending. """ if not self.memory_entries: return [] @@ -192,14 +207,13 @@ def retrieve_top_k_entries(self, k: int) -> list[MemoryEntry]: return [entry for _, entry in final_scores[:k]] def _finalize_entry(self, type: str, graded_content: dict): - """Shared function for both sync/async add to memory which helps to create memory entry and stores it into a base class.""" + """Create and persist a finalized episodic entry.""" new_entry = MemoryEntry( agent=self.agent, content={type: graded_content}, step=self.agent.model.steps, ) self.memory_entries.append(new_entry) - super().add_to_memory(type, graded_content) def add_to_memory(self, type: str, content: dict): """ @@ -243,20 +257,18 @@ def get_communication_history(self) -> str: async def aprocess_step(self, pre_step: bool = False): """ - Asynchronous version of process_step + Asynchronous version of process_step. + + EpisodicMemory persists entries at add-time and does not use two-phase + pre/post-step buffering. """ - if pre_step: - await self.aadd_to_memory(type="observation", content=self.step_content) - self.step_content = {} - return + return def process_step(self, pre_step: bool = False): """ - Process the step of the agent : - - Add the new entry to the memory - - Display the new entry + Process step hook (no-op for episodic memory). + + EpisodicMemory persists entries at add-time and does not use two-phase + pre/post-step buffering. """ - if pre_step: - self.add_to_memory(type="observation", content=self.step_content) - self.step_content = {} - return + return diff --git a/tests/test_memory/test_episodic_memory.py b/tests/test_memory/test_episodic_memory.py index 29237ca6..ad841fee 100644 --- a/tests/test_memory/test_episodic_memory.py +++ b/tests/test_memory/test_episodic_memory.py @@ -91,17 +91,16 @@ def test_add_memory_entry(self, mock_agent): # Test with action memory.add_to_memory("action", {"action": "Test action"}) - # Should be empty step_content initially - assert memory.step_content != {} + # EpisodicMemory should not rely on transient step buffers. + assert memory.step_content == {} assert len(memory.memory_entries) == 3, ( "add_to_memory graded the event but never created a MemoryEntry" ) def test_finalize_entry_consistency(self, mock_agent): - """Minimal tests for the helper function _finalize_event_entry(). + """Minimal tests for the helper function _finalize_entry(). - This test ensures that: - A `MemoryEntry` object is created and stored in episodic memory. - - The graded content is forwarded to the base `Memory.step_content` via `super().add_to_memory()` (regression guard). - The stored entry contains the correct importance score. - The entry is stamped with the current agent step. """ @@ -111,7 +110,7 @@ def test_finalize_entry_consistency(self, mock_agent): memory._finalize_entry("observation", graded_content) assert memory.memory_entries[0].content["observation"]["importance"] == 4 - assert memory.step_content["observation"]["importance"] == 4 + assert memory.step_content == {} assert isinstance(memory.memory_entries[0], MemoryEntry) assert memory.memory_entries[0].step == mock_agent.model.steps @@ -170,38 +169,48 @@ def test_retrieve_top_k_importance_beats_recency(self, mock_agent): def test_process_step_pre_step(self, mock_agent): """ - The process_step function in the episodic_memory when called with 'pre_step=True' takes whatever is already inside the step_content, - then calls the add_to_memory function and then clears the step_content. - - This test function performs the following 2 tests, - - Checks whether the add_to_memory function is called correctly when 'pre_step=True.' - - Also performs a final check to ensure the step_content is cleared. + EpisodicMemory process_step is a no-op: it should not call add_to_memory + and should not mutate transient buffers. """ memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") # Pre-populate step_content memory.step_content = {"observation": {"data": "test"}} - # Spy on add_to_memory and call the process step with param as True + # Spy on add_to_memory and call process_step with pre_step=True memory.add_to_memory = MagicMock() memory.process_step(pre_step=True) - # Checks if add_to_memory was called once - memory.add_to_memory.assert_called_once_with( - type="observation", - content={"observation": {"data": "test"}}, - ) + memory.add_to_memory.assert_not_called() - # checks whether the step_content is cleared at the end - assert memory.step_content == {} + # no-op should keep buffer unchanged + assert memory.step_content == {"observation": {"data": "test"}} + + def test_process_step_pre_step_does_not_append_entries(self, mock_agent): + """ + Regression test: process_step/pre_step must not append synthetic entries. + """ + memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") + + mock_response = MagicMock() + mock_response.choices[0].message.content = json.dumps({"grade": 3}) + memory.llm.generate = MagicMock(return_value=mock_response) + + memory.add_to_memory("action", {"action": "real event"}) + existing_entries = list(memory.memory_entries) + assert len(existing_entries) == 1 + + memory.step_content = {"observation": {"data": "buffer-only"}} + memory.process_step(pre_step=True) + + assert list(memory.memory_entries) == existing_entries + assert len(memory.memory_entries) == 1 + assert memory.step_content == {"observation": {"data": "buffer-only"}} @pytest.mark.asyncio async def test_aprocess_step_pre_step(self, mock_agent): """ - Asynchronous version of the 'test_process_step_pre_step' - Implements the same checks as the sync counterpart function but in an async manner. - - checks whether aadd_to_memory function was called correctly - - checks whether the step_content was cleared correctly at the end + Async process_step is also a no-op for episodic memory. """ memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") @@ -210,13 +219,33 @@ async def test_aprocess_step_pre_step(self, mock_agent): await memory.aprocess_step(pre_step=True) - # - memory.aadd_to_memory.assert_awaited_once_with( - type="observation", - content={"observation": {"data": "test"}}, - ) + memory.aadd_to_memory.assert_not_awaited() - assert memory.step_content == {} + assert memory.step_content == {"observation": {"data": "test"}} + + @pytest.mark.asyncio + async def test_aprocess_step_pre_step_does_not_append_entries(self, mock_agent): + """ + Async regression test: process_step/pre_step must not append synthetic entries. + """ + memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") + + mock_response = MagicMock() + mock_response.choices = [ + MagicMock(message=MagicMock(content=json.dumps({"grade": 3}))) + ] + memory.llm.agenerate = AsyncMock(return_value=mock_response) + + await memory.aadd_to_memory("action", {"action": "real event"}) + existing_entries = list(memory.memory_entries) + assert len(existing_entries) == 1 + + memory.step_content = {"observation": {"data": "buffer-only"}} + await memory.aprocess_step(pre_step=True) + + assert list(memory.memory_entries) == existing_entries + assert len(memory.memory_entries) == 1 + assert memory.step_content == {"observation": {"data": "buffer-only"}} @pytest.mark.asyncio async def test_async_add_memory_entry(self, mock_agent): @@ -226,7 +255,7 @@ async def test_async_add_memory_entry(self, mock_agent): The test function does the following - mocks the llm to produece a pre-determined grading. - then calls the aad_to_memory function - - checks to ensure that the step_content is not empty as the aadd_to_memory function will have added entries into it. + - checks that entries are persisted directly into memory_entries. """ memory = EpisodicMemory(agent=mock_agent, llm_model="provider/test_model") @@ -248,8 +277,7 @@ async def test_async_add_memory_entry(self, mock_agent): for new_entry in memory.memory_entries: event_type = next(iter(new_entry.content.keys())) assert new_entry.content[event_type]["importance"] == 3 - # checks to ensure that step content is not empty - assert memory.step_content != {} + assert memory.step_content == {} assert len(memory.memory_entries) == 3, ( "aadd_to_memory graded the event but never created a MemoryEntry" ) From 962f2774f08927cf0c7074d4327784790c2de777 Mon Sep 17 00:00:00 2001 From: Wang Boyu Date: Thu, 5 Mar 2026 15:38:33 -0500 Subject: [PATCH 6/6] fix integration test for episodic memory --- .../test_integration/test_memory_reasoning.py | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/tests/test_integration/test_memory_reasoning.py b/tests/test_integration/test_memory_reasoning.py index f868e3fb..ae31c6f3 100644 --- a/tests/test_integration/test_memory_reasoning.py +++ b/tests/test_integration/test_memory_reasoning.py @@ -280,12 +280,14 @@ def test_plan_records_to_memory(self, monkeypatch): plan = reasoning.plan(obs=obs) assert isinstance(plan, Plan) - assert memory.step_content["Observation"]["content"] == str(obs) - assert memory.step_content["Plan"]["content"] == plan_content - assert memory.step_content["Plan-Execution"]["content"] == str(plan) - assert memory.step_content["Observation"]["importance"] == 3 - assert memory.step_content["Plan"]["importance"] == 3 - assert memory.step_content["Plan-Execution"]["importance"] == 3 + entries = list(memory.memory_entries) + assert len(entries) == 3 + assert entries[0].content["Observation"]["content"] == str(obs) + assert entries[1].content["Plan"]["content"] == plan_content + assert entries[2].content["Plan-Execution"]["content"] == str(plan) + assert entries[0].content["Observation"]["importance"] == 3 + assert entries[1].content["Plan"]["importance"] == 3 + assert entries[2].content["Plan-Execution"]["importance"] == 3 assert memory.grade_event_importance.call_count == 3 def test_async_plan_works(self, monkeypatch): @@ -301,12 +303,14 @@ def test_async_plan_works(self, monkeypatch): plan = asyncio.run(reasoning.aplan(obs=obs)) assert isinstance(plan, Plan) - assert memory.step_content["Observation"]["content"] == str(obs) - assert memory.step_content["Plan"]["content"] == plan_content - assert memory.step_content["Plan-Execution"]["content"] == str(plan) - assert memory.step_content["Observation"]["importance"] == 3 - assert memory.step_content["Plan"]["importance"] == 3 - assert memory.step_content["Plan-Execution"]["importance"] == 3 + entries = list(memory.memory_entries) + assert len(entries) == 3 + assert entries[0].content["Observation"]["content"] == str(obs) + assert entries[1].content["Plan"]["content"] == plan_content + assert entries[2].content["Plan-Execution"]["content"] == str(plan) + assert entries[0].content["Observation"]["importance"] == 3 + assert entries[1].content["Plan"]["importance"] == 3 + assert entries[2].content["Plan-Execution"]["importance"] == 3 assert memory.agrade_event_importance.await_count == 3 @@ -677,8 +681,10 @@ def test_plan_records_to_memory(self, monkeypatch): plan = reasoning.plan() assert isinstance(plan, Plan) - assert memory.step_content["plan"]["content"] == plan_content - assert memory.step_content["plan"]["importance"] == 3 + entries = list(memory.memory_entries) + assert len(entries) == 1 + assert entries[0].content["plan"]["content"] == plan_content + assert entries[0].content["plan"]["importance"] == 3 assert memory.grade_event_importance.call_count == 1 reasoning.execute_tool_call.assert_called_once_with( plan_content, selected_tools=None, ttl=1 @@ -699,8 +705,10 @@ def test_async_plan_works(self, monkeypatch): plan = asyncio.run(reasoning.aplan()) assert isinstance(plan, Plan) - assert memory.step_content["plan"]["content"] == plan_content - assert memory.step_content["plan"]["importance"] == 3 + entries = list(memory.memory_entries) + assert len(entries) == 1 + assert entries[0].content["plan"]["content"] == plan_content + assert entries[0].content["plan"]["importance"] == 3 assert memory.grade_event_importance.call_count == 1 reasoning.aexecute_tool_call.assert_awaited_once_with( plan_content, selected_tools=None, ttl=1