Commit 06ca986

feat: add memreader to tackle with internet (#154)
* feat: add memreader to tackle with internet
* feat: delete chunker in internet retriever and add 'info' parameter in all retrievers
* test: fix test for updated memreader
* test: fix test for updated memreader
* test: fix test for updated memreader
1 parent b946377 commit 06ca986
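
The thrust of the change: every retriever now accepts an `info` dict for user/session attribution, and the Xinyu internet retriever hands raw page content to a MemReader instead of a chunker. A minimal sketch of the new call shape, with placeholder IDs (not from this commit):

    def search_internet(retriever, query: str):
        # info is new in this commit: it stamps user/session onto each result.
        info = {"user_id": "u-123", "session_id": "s-456"}  # placeholder values
        return retriever.retrieve_from_internet(
            query=query, top_k=10, parsed_goal=None, info=info
        )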

File tree

7 files changed: +38, -68 lines changed

src/memos/configs/internet_retriever.py

Lines changed: 4 additions & 4 deletions
@@ -4,9 +4,9 @@

 from pydantic import Field, field_validator, model_validator

-from memos.chunkers.factory import ChunkerConfigFactory
 from memos.configs.base import BaseConfig
 from memos.exceptions import ConfigurationError
+from memos.mem_reader.factory import MemReaderConfigFactory


 class BaseInternetRetrieverConfig(BaseConfig):

@@ -48,10 +48,10 @@ class XinyuSearchConfig(BaseInternetRetrieverConfig):
     num_per_request: int = Field(
         default=10, description="Number of results per API request (not used for Xinyu)"
     )
-    chunker: ChunkerConfigFactory = Field(
+    reader: MemReaderConfigFactory = Field(
         ...,
-        default_factory=ChunkerConfigFactory,
-        description="Chunker configuration",
+        default_factory=MemReaderConfigFactory,
+        description="Reader configuration",
     )
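
For configuration, the visible effect is that `XinyuSearchConfig` takes a `reader` block where it used to take `chunker`. A sketch of a plausible config dict, assuming `MemReaderConfigFactory` follows the same backend/config shape as the other config factories here (the backend name and inner fields are illustrative):

    xinyu_config = {
        "api_key": "...",            # placeholder
        "search_engine_id": "...",   # placeholder
        "max_results": 20,
        "reader": {                  # was "chunker" before this commit
            "backend": "simple_struct",  # assumed backend name
            "config": {},
        },
    }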

src/memos/mem_reader/simple_struct.py

Lines changed: 6 additions & 2 deletions
@@ -180,8 +180,12 @@ def get_scene_data_info(self, scene_data: list, type: str) -> list[str]:
         elif type == "doc":
             for item in scene_data:
                 try:
-                    parsed_text = parser.parse(item)
-                    results.append({"file": item, "text": parsed_text})
+                    if not isinstance(item, str):
+                        parsed_text = parser.parse(item)
+                        results.append({"file": "pure_text", "text": parsed_text})
+                    else:
+                        parsed_text = item
+                        results.append({"file": item, "text": parsed_text})
                 except Exception as e:
                     print(f"Error parsing file {item}: {e!s}")
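
The new branch lets `scene_data` mix raw strings with parseable inputs: strings are passed through untouched (the string doubles as the `file` label), while anything else goes through the parser and is filed under "pure_text". Distilled to a standalone function (a sketch, not the repo's API):

    def normalize_doc(item, parser):
        # Raw strings skip the parser entirely.
        if not isinstance(item, str):
            return {"file": "pure_text", "text": parser.parse(item)}
        return {"file": item, "text": item}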

src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever.py

Lines changed: 4 additions & 3 deletions
@@ -127,7 +127,7 @@ def __init__(
         self.embedder = embedder

     def retrieve_from_internet(
-        self, query: str, top_k: int = 10, parsed_goal=None
+        self, query: str, top_k: int = 10, parsed_goal=None, info=None
     ) -> list[TextualMemoryItem]:
         """
         Retrieve information from the internet and convert to TextualMemoryItem format

@@ -136,6 +136,7 @@ def retrieve_from_internet(
             query: Search query
             top_k: Number of results to return
             parsed_goal: Parsed task goal (optional)
+            info (dict): Leave a record of memory consumption.

         Returns:
             List of TextualMemoryItem

@@ -157,8 +158,8 @@ def retrieve_from_internet(
             memory_content = f"Title: {title}\nSummary: {snippet}\nSource: {link}"
             # Create metadata
             metadata = TreeNodeTextualMemoryMetadata(
-                user_id=None,
-                session_id=None,
+                user_id=info.get("user_id", ""),
+                session_id=info.get("session_id", ""),
                 status="activated",
                 type="fact",  # Internet search results are usually factual information
                 memory_time=datetime.now().strftime("%Y-%m-%d"),
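
Note that `info` still defaults to None while the body now calls `info.get(...)`, so in practice callers must supply the dict. A defensive wrapper might look like this (a sketch, not part of this commit):

    def safe_retrieve(retriever, query, info=None, **kwargs):
        # Normalize so info.get(...) inside the retriever never runs on None.
        return retriever.retrieve_from_internet(query, info=info or {}, **kwargs)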

src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py

Lines changed: 2 additions & 2 deletions
@@ -2,9 +2,9 @@

 from typing import Any, ClassVar

-from memos.chunkers.factory import ChunkerFactory
 from memos.configs.internet_retriever import InternetRetrieverConfigFactory
 from memos.embedders.base import BaseEmbedder
+from memos.mem_reader.factory import MemReaderFactory
 from memos.memories.textual.tree_text_memory.retrieve.internet_retriever import (
     InternetGoogleRetriever,
 )

@@ -67,7 +67,7 @@ def from_config(
                 access_key=config.api_key,  # Use api_key as access_key for xinyu
                 search_engine_id=config.search_engine_id,
                 embedder=embedder,
-                chunker=ChunkerFactory.from_config(config.chunker),
+                reader=MemReaderFactory.from_config(config.reader),
                 max_results=config.max_results,
             )
         else:
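
Condensed, the Xinyu branch of `from_config` now wires the reader in as below; the retriever class name `XinyuSearchRetriever` is an assumption, since the constructor call sits above this hunk:

    def build_xinyu(config, embedder):
        return XinyuSearchRetriever(              # assumed class name
            access_key=config.api_key,            # api_key doubles as access_key
            search_engine_id=config.search_engine_id,
            embedder=embedder,
            reader=MemReaderFactory.from_config(config.reader),  # was ChunkerFactory
            max_results=config.max_results,
        )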

src/memos/memories/textual/tree_text_memory/retrieve/searcher.py

Lines changed: 1 addition & 1 deletion
@@ -141,7 +141,7 @@ def retrieve_from_internet():
             if memory_type not in ["All"]:
                 return []
             internet_items = self.internet_retriever.retrieve_from_internet(
-                query=query, top_k=top_k, parsed_goal=parsed_goal
+                query=query, top_k=top_k, parsed_goal=parsed_goal, info=info
            )

             # Convert to the format expected by reranker

src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py

Lines changed: 18 additions & 53 deletions
@@ -8,10 +8,10 @@

 import requests

-from memos.chunkers.base import BaseChunker
 from memos.embedders.factory import OllamaEmbedder
 from memos.log import get_logger
-from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemoryMetadata
+from memos.mem_reader.base import BaseMemReader
+from memos.memories.textual.item import TextualMemoryItem


 logger = get_logger(__name__)

@@ -114,7 +114,7 @@ def __init__(
         access_key: str,
         search_engine_id: str,
         embedder: OllamaEmbedder,
-        chunker: BaseChunker,
+        reader: BaseMemReader,
         max_results: int = 20,
     ):
         """

@@ -124,13 +124,14 @@ def __init__(
             access_key: Xinyu API access key
             embedder: Embedder instance for generating embeddings
             max_results: Maximum number of results to retrieve
+            reader: MemReader Moduel to deal with internet contents
         """
         self.xinyu_api = XinyuSearchAPI(access_key, search_engine_id, max_results=max_results)
         self.embedder = embedder
-        self.chunker = chunker
+        self.reader = reader

     def retrieve_from_internet(
-        self, query: str, top_k: int = 10, parsed_goal=None
+        self, query: str, top_k: int = 10, parsed_goal=None, info=None
     ) -> list[TextualMemoryItem]:
         """
         Retrieve information from Xinyu search and convert to TextualMemoryItem format

@@ -139,7 +140,7 @@ def retrieve_from_internet(
             query: Search query
             top_k: Number of results to return
             parsed_goal: Parsed task goal (optional)
-
+            info (dict): Leave a record of memory consumption.
         Returns:
             List of TextualMemoryItem
         """

@@ -151,7 +152,7 @@ def retrieve_from_internet(

         with ThreadPoolExecutor(max_workers=8) as executor:
             futures = [
-                executor.submit(self._process_result, result, query, parsed_goal)
+                executor.submit(self._process_result, result, query, parsed_goal, info)
                 for result in search_results
             ]
             for future in as_completed(futures):

@@ -301,7 +302,7 @@ def _extract_tags(self, title: str, content: str, summary: str, parsed_goal=None
         return list(set(tags))[:15]  # Limit to 15 tags

     def _process_result(
-        self, result: dict, query: str, parsed_goal: str
+        self, result: dict, query: str, parsed_goal: str, info: dict
     ) -> list[TextualMemoryItem]:
         title = result.get("title", "")
         content = result.get("content", "")

@@ -318,55 +319,19 @@ def _process_result(
             publish_time = datetime.now().strftime("%Y-%m-%d")
         else:
             publish_time = datetime.now().strftime("%Y-%m-%d")
-        source = result.get("source", "")
-        site = result.get("site", "")
-        if site:
-            site = site.split("|")[0]

-        qualified_chunks = self._chunk(content)
+        read_items = self.reader.get_memory([content], type="doc", info=info)

         memory_items = []
-        for chunk_text, chunk_emb, score in qualified_chunks:
-            memory_content = (
+        for read_item_i in read_items[0]:
+            read_item_i.memory = (
                 f"Title: {title}\nNewsTime: {publish_time}\nSummary: {summary}\n"
-                f"Content: {chunk_text}\nSource: {url}"
-            )
-            metadata = TreeNodeTextualMemoryMetadata(
-                user_id=None,
-                session_id=None,
-                status="activated",
-                type="fact",
-                source="web",
-                confidence=score,
-                entities=self._extract_entities(title, content, summary),
-                tags=self._extract_tags(title, content, summary, parsed_goal),
-                visibility="public",
-                memory_type="OuterMemory",
-                key=f"[{source}]" + title,
-                sources=[url] if url else [],
-                embedding=chunk_emb,
-                created_at=datetime.now().isoformat(),
-                usage=[],
-                background=f"Xinyu search result from {site or source}",
-            )
-            memory_items.append(
-                TextualMemoryItem(id=str(uuid.uuid4()), memory=memory_content, metadata=metadata)
+                f"Content: {read_item_i.memory}"
             )
+            read_item_i.metadata.source = "web"
+            read_item_i.metadata.memory_type = "OuterMemory"
+            read_item_i.metadata.sources = [url] if url else []
+            read_item_i.metadata.visibility = "public"

+            memory_items.append(read_item_i)
         return memory_items
-
-    def _chunk(self, content: str) -> list[tuple[str, list[float], float]]:
-        """
-        Use SentenceChunker to split content into chunks and embed each.
-
-        Returns:
-            List of (chunk_text, chunk_embedding, dummy_score)
-        """
-        chunks = self.chunker.chunk(content)
-        if not chunks:
-            return []
-
-        chunk_texts = [c.text for c in chunks]
-        chunk_embeddings = self.embedder.embed(chunk_texts)
-
-        return [(text, emb, 1.0) for text, emb in zip(chunk_texts, chunk_embeddings, strict=False)]
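
The net effect in `_process_result`: chunk-and-embed is gone, and each hit's content goes through `reader.get_memory`, which returns one list of memory items per input document; the retriever then only decorates those items. A self-contained sketch of that contract, with stand-ins for the real reader and item types:

    from dataclasses import dataclass, field
    from types import SimpleNamespace

    @dataclass
    class FakeItem:                      # stand-in for TextualMemoryItem
        memory: str
        metadata: SimpleNamespace = field(default_factory=SimpleNamespace)

    class StubReader:                    # stand-in for a BaseMemReader
        def get_memory(self, docs, type, info):
            # One inner list of items per input document.
            return [[FakeItem(memory=d)] for d in docs]

    content = "page text scraped from a search hit"
    info = {"user_id": "u-1", "session_id": "s-1"}
    read_items = StubReader().get_memory([content], type="doc", info=info)
    for item in read_items[0]:           # items for the single document
        item.metadata.source = "web"
        item.metadata.memory_type = "OuterMemory"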

tests/mem_reader/test_simple_structure.py

Lines changed: 3 additions & 3 deletions
@@ -117,18 +117,18 @@ def test_get_scene_data_info_with_chat(self):
         self.assertEqual(len(result), 1)
         self.assertEqual(result[0][0], "user: [3 May 2025]: I'm feeling a bit down today.")

-    @patch("memos.parsers.factory.ParserFactory")
+    @patch("memos.mem_reader.simple_struct.ParserFactory")
     def test_get_scene_data_info_with_doc(self, mock_parser_factory):
         """Test parsing document files."""
         parser_instance = MagicMock()
         parser_instance.parse.return_value = "Parsed document text.\n"
         mock_parser_factory.from_config.return_value = parser_instance

-        scene_data = ["tests/mem_reader/test.txt"]
+        scene_data = [{"fake_file_like": "should trigger parse"}]
         result = self.reader.get_scene_data_info(scene_data, type="doc")

         self.assertIsInstance(result, list)
-        self.assertEqual(result[0]["text"], "Parsed document text\n")
+        self.assertEqual(result[0]["text"], "Parsed document text.\n")

     def test_parse_json_result_success(self):
         """Test successful JSON parsing."""
