Commit 0bdd54b

feat: add memreader to tackle with internet

1 parent b946377

5 files changed: +32 −53 lines

src/memos/configs/internet_retriever.py

Lines changed: 6 additions & 0 deletions
@@ -7,6 +7,7 @@
 from memos.chunkers.factory import ChunkerConfigFactory
 from memos.configs.base import BaseConfig
 from memos.exceptions import ConfigurationError
+from memos.mem_reader.factory import MemReaderConfigFactory


 class BaseInternetRetrieverConfig(BaseConfig):
@@ -53,6 +54,11 @@ class XinyuSearchConfig(BaseInternetRetrieverConfig):
         default_factory=ChunkerConfigFactory,
         description="Chunker configuration",
     )
+    reader: MemReaderConfigFactory = Field(
+        ...,
+        default_factory=MemReaderConfigFactory,
+        description="Reader configuration",
+    )


 class InternetRetrieverConfigFactory(BaseConfig):
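The new reader field mirrors the existing chunker field, so a Xinyu retriever config now also nests a mem-reader config. A minimal sketch of what a payload might look like, assuming the usual memos backend/config factory shape and a "simple_struct" reader backend (neither the backend names nor the empty inner configs appear in this diff; they are illustrative assumptions):

from memos.configs.internet_retriever import XinyuSearchConfig

# Hypothetical payload: the "sentence" and "simple_struct" backend names and
# the empty inner configs are assumptions, not part of this commit.
config = XinyuSearchConfig(
    access_key="XINYU_ACCESS_KEY",
    search_engine_id="my-engine",
    chunker={"backend": "sentence", "config": {}},
    reader={"backend": "simple_struct", "config": {}},
)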

src/memos/mem_reader/simple_struct.py

Lines changed: 6 additions & 2 deletions
@@ -180,8 +180,12 @@ def get_scene_data_info(self, scene_data: list, type: str) -> list[str]:
         elif type == "doc":
             for item in scene_data:
                 try:
-                    parsed_text = parser.parse(item)
-                    results.append({"file": item, "text": parsed_text})
+                    if not isinstance(item, str):
+                        parsed_text = parser.parse(item)
+                        results.append({"file": "pure_text", "text": parsed_text})
+                    else:
+                        parsed_text = item
+                        results.append({"file": item, "text": parsed_text})
                 except Exception as e:
                     print(f"Error parsing file {item}: {e!s}")

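The practical effect: get_scene_data_info(..., type="doc") previously assumed every item was a parseable document; now a plain string is treated as already-extracted text, which is what the reworked internet retriever passes in. A standalone illustration of the branch (parser stands in for the reader's document parser; the "pure_text" label is exactly what the commit attaches to parsed non-string items):

def scene_info(item, parser):
    # Mirrors the new branch in simple_struct.py: non-string items still go
    # through the document parser, while raw strings (e.g. web content from
    # the internet retriever) are used verbatim.
    if not isinstance(item, str):
        return {"file": "pure_text", "text": parser.parse(item)}
    return {"file": item, "text": item}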

src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,7 @@
 from memos.chunkers.factory import ChunkerFactory
 from memos.configs.internet_retriever import InternetRetrieverConfigFactory
 from memos.embedders.base import BaseEmbedder
+from memos.mem_reader.factory import MemReaderFactory
 from memos.memories.textual.tree_text_memory.retrieve.internet_retriever import (
     InternetGoogleRetriever,
 )
@@ -68,6 +69,7 @@ def from_config(
                 search_engine_id=config.search_engine_id,
                 embedder=embedder,
                 chunker=ChunkerFactory.from_config(config.chunker),
+                reader=MemReaderFactory.from_config(config.reader),
                 max_results=config.max_results,
             )
         else:
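For reference, the new factory branch is equivalent to constructing the Xinyu retriever directly with a reader built from the new config field. A sketch, assuming the class in xinyusearch.py is exported as XinyuSearchRetriever (the class name is not shown in this diff) and that config and embedder are already in hand:

from memos.chunkers.factory import ChunkerFactory
from memos.mem_reader.factory import MemReaderFactory

retriever = XinyuSearchRetriever(  # class name assumed
    access_key=config.access_key,
    search_engine_id=config.search_engine_id,
    embedder=embedder,
    chunker=ChunkerFactory.from_config(config.chunker),
    reader=MemReaderFactory.from_config(config.reader),  # new in this commit
    max_results=config.max_results,
)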

src/memos/memories/textual/tree_text_memory/retrieve/searcher.py

Lines changed: 1 addition & 1 deletion
@@ -141,7 +141,7 @@ def retrieve_from_internet():
             if memory_type not in ["All"]:
                 return []
             internet_items = self.internet_retriever.retrieve_from_internet(
-                query=query, top_k=top_k, parsed_goal=parsed_goal
+                query=query, top_k=top_k, parsed_goal=parsed_goal, info=info
             )

             # Convert to the format expected by reranker
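The searcher now forwards its info dict to the retriever, which in turn hands it to reader.get_memory, so memories built from web results carry the caller's bookkeeping. A hedged usage sketch; the user_id/session_id keys are an assumption about what info typically carries, not something this diff shows:

info = {"user_id": "u-123", "session_id": "s-456"}  # assumed keys
internet_items = internet_retriever.retrieve_from_internet(
    query="memory-augmented agents",
    top_k=10,
    parsed_goal=None,
    info=info,  # newly threaded through to the mem reader
)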

src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py

Lines changed: 17 additions & 50 deletions
@@ -11,7 +11,8 @@
 from memos.chunkers.base import BaseChunker
 from memos.embedders.factory import OllamaEmbedder
 from memos.log import get_logger
-from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemoryMetadata
+from memos.mem_reader.base import BaseMemReader
+from memos.memories.textual.item import TextualMemoryItem


 logger = get_logger(__name__)
@@ -115,6 +116,7 @@ def __init__(
         search_engine_id: str,
         embedder: OllamaEmbedder,
         chunker: BaseChunker,
+        reader: BaseMemReader,
         max_results: int = 20,
     ):
         """
@@ -128,9 +130,10 @@
         self.xinyu_api = XinyuSearchAPI(access_key, search_engine_id, max_results=max_results)
         self.embedder = embedder
         self.chunker = chunker
+        self.reader = reader

     def retrieve_from_internet(
-        self, query: str, top_k: int = 10, parsed_goal=None
+        self, query: str, top_k: int = 10, parsed_goal=None, info=None
     ) -> list[TextualMemoryItem]:
         """
         Retrieve information from Xinyu search and convert to TextualMemoryItem format
@@ -139,7 +142,7 @@
             query: Search query
             top_k: Number of results to return
             parsed_goal: Parsed task goal (optional)
-
+            info (dict): Leave a record of memory consumption.
         Returns:
             List of TextualMemoryItem
         """
@@ -151,7 +154,7 @@

         with ThreadPoolExecutor(max_workers=8) as executor:
             futures = [
-                executor.submit(self._process_result, result, query, parsed_goal)
+                executor.submit(self._process_result, result, query, parsed_goal, info)
                 for result in search_results
             ]
             for future in as_completed(futures):
@@ -301,7 +304,7 @@ def _extract_tags(self, title: str, content: str, summary: str, parsed_goal=None
         return list(set(tags))[:15]  # Limit to 15 tags

     def _process_result(
-        self, result: dict, query: str, parsed_goal: str
+        self, result: dict, query: str, parsed_goal: str, info: dict
     ) -> list[TextualMemoryItem]:
         title = result.get("title", "")
         content = result.get("content", "")
@@ -318,55 +321,19 @@
                 publish_time = datetime.now().strftime("%Y-%m-%d")
             else:
                 publish_time = datetime.now().strftime("%Y-%m-%d")
-        source = result.get("source", "")
-        site = result.get("site", "")
-        if site:
-            site = site.split("|")[0]

-        qualified_chunks = self._chunk(content)
+        read_items = self.reader.get_memory([content], type="doc", info=info)

         memory_items = []
-        for chunk_text, chunk_emb, score in qualified_chunks:
-            memory_content = (
+        for read_item_i in read_items[0]:
+            read_item_i.memory = (
                 f"Title: {title}\nNewsTime: {publish_time}\nSummary: {summary}\n"
-                f"Content: {chunk_text}\nSource: {url}"
-            )
-            metadata = TreeNodeTextualMemoryMetadata(
-                user_id=None,
-                session_id=None,
-                status="activated",
-                type="fact",
-                source="web",
-                confidence=score,
-                entities=self._extract_entities(title, content, summary),
-                tags=self._extract_tags(title, content, summary, parsed_goal),
-                visibility="public",
-                memory_type="OuterMemory",
-                key=f"[{source}]" + title,
-                sources=[url] if url else [],
-                embedding=chunk_emb,
-                created_at=datetime.now().isoformat(),
-                usage=[],
-                background=f"Xinyu search result from {site or source}",
-            )
-            memory_items.append(
-                TextualMemoryItem(id=str(uuid.uuid4()), memory=memory_content, metadata=metadata)
+                f"Content: {read_item_i.memory}"
             )
+            read_item_i.metadata.source = "web"
+            read_item_i.metadata.memory_type = "OuterMemory"
+            read_item_i.metadata.sources = [url] if url else []
+            read_item_i.metadata.visibility = "public"

+            memory_items.append(read_item_i)
         return memory_items
-
-    def _chunk(self, content: str) -> list[tuple[str, list[float], float]]:
-        """
-        Use SentenceChunker to split content into chunks and embed each.
-
-        Returns:
-            List of (chunk_text, chunk_embedding, dummy_score)
-        """
-        chunks = self.chunker.chunk(content)
-        if not chunks:
-            return []
-
-        chunk_texts = [c.text for c in chunks]
-        chunk_embeddings = self.embedder.embed(chunk_texts)
-
-        return [(text, emb, 1.0) for text, emb in zip(chunk_texts, chunk_embeddings, strict=False)]
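Net effect of the rewrite: the retriever no longer chunks and embeds content itself (the _chunk helper is deleted) and no longer hand-builds TreeNodeTextualMemoryMetadata; reader.get_memory returns ready-made TextualMemoryItem objects, and _process_result only reframes the memory text and relabels provenance. In outline (a paraphrase of the committed loop, with the surrounding method stripped away; title, publish_time, summary, url, and info are locals of _process_result):

# get_memory returns one list of TextualMemoryItem per input document,
# hence read_items[0] for the single `content` string.
read_items = reader.get_memory([content], type="doc", info=info)

memory_items = []
for item in read_items[0]:
    item.memory = (
        f"Title: {title}\nNewsTime: {publish_time}\nSummary: {summary}\n"
        f"Content: {item.memory}"
    )
    item.metadata.source = "web"            # provenance
    item.metadata.memory_type = "OuterMemory"
    item.metadata.sources = [url] if url else []
    item.metadata.visibility = "public"
    memory_items.append(item)

Note what this drops relative to the old code path: per-chunk confidence scores, the extracted entities and tags, the "[source]title" key, and the site/source background string are now whatever the reader produces, or absent.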
