@@ -8,10 +8,10 @@

 import requests

-from memos.chunkers.base import BaseChunker
 from memos.embedders.factory import OllamaEmbedder
 from memos.log import get_logger
-from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemoryMetadata
+from memos.mem_reader.base import BaseMemReader
+from memos.memories.textual.item import TextualMemoryItem


 logger = get_logger(__name__)
@@ -114,7 +114,7 @@ def __init__(
         access_key: str,
         search_engine_id: str,
         embedder: OllamaEmbedder,
-        chunker: BaseChunker,
+        reader: BaseMemReader,
         max_results: int = 20,
     ):
         """
@@ -124,13 +124,14 @@ def __init__(
             access_key: Xinyu API access key
             embedder: Embedder instance for generating embeddings
             max_results: Maximum number of results to retrieve
+            reader: MemReader module used to process internet content
         """
         self.xinyu_api = XinyuSearchAPI(access_key, search_engine_id, max_results=max_results)
         self.embedder = embedder
-        self.chunker = chunker
+        self.reader = reader

     def retrieve_from_internet(
-        self, query: str, top_k: int = 10, parsed_goal=None
+        self, query: str, top_k: int = 10, parsed_goal=None, info=None
     ) -> list[TextualMemoryItem]:
         """
         Retrieve information from Xinyu search and convert to TextualMemoryItem format
@@ -139,7 +140,7 @@ def retrieve_from_internet(
             query: Search query
             top_k: Number of results to return
             parsed_goal: Parsed task goal (optional)
-
+            info (dict): Metadata recording memory consumption (optional).
         Returns:
             List of TextualMemoryItem
         """
@@ -151,7 +152,7 @@ def retrieve_from_internet(

         with ThreadPoolExecutor(max_workers=8) as executor:
             futures = [
-                executor.submit(self._process_result, result, query, parsed_goal)
+                executor.submit(self._process_result, result, query, parsed_goal, info)
                 for result in search_results
             ]
             for future in as_completed(futures):
@@ -301,7 +302,7 @@ def _extract_tags(self, title: str, content: str, summary: str, parsed_goal=None
         return list(set(tags))[:15]  # Limit to 15 tags

     def _process_result(
-        self, result: dict, query: str, parsed_goal: str
+        self, result: dict, query: str, parsed_goal: str, info: dict
     ) -> list[TextualMemoryItem]:
         title = result.get("title", "")
         content = result.get("content", "")
@@ -318,55 +319,19 @@ def _process_result(
             publish_time = datetime.now().strftime("%Y-%m-%d")
         else:
             publish_time = datetime.now().strftime("%Y-%m-%d")
-        source = result.get("source", "")
-        site = result.get("site", "")
-        if site:
-            site = site.split("|")[0]

-        qualified_chunks = self._chunk(content)
+        read_items = self.reader.get_memory([content], type="doc", info=info)

         memory_items = []
-        for chunk_text, chunk_emb, score in qualified_chunks:
-            memory_content = (
+        for read_item_i in read_items[0]:
+            read_item_i.memory = (
                 f"Title: {title}\nNewsTime: {publish_time}\nSummary: {summary}\n"
-                f"Content: {chunk_text}\nSource: {url}"
-            )
-            metadata = TreeNodeTextualMemoryMetadata(
-                user_id=None,
-                session_id=None,
-                status="activated",
-                type="fact",
-                source="web",
-                confidence=score,
-                entities=self._extract_entities(title, content, summary),
-                tags=self._extract_tags(title, content, summary, parsed_goal),
-                visibility="public",
-                memory_type="OuterMemory",
-                key=f"[{source}]" + title,
-                sources=[url] if url else [],
-                embedding=chunk_emb,
-                created_at=datetime.now().isoformat(),
-                usage=[],
-                background=f"Xinyu search result from {site or source}",
-            )
-            memory_items.append(
-                TextualMemoryItem(id=str(uuid.uuid4()), memory=memory_content, metadata=metadata)
+                f"Content: {read_item_i.memory}"
             )
+            read_item_i.metadata.source = "web"
+            read_item_i.metadata.memory_type = "OuterMemory"
+            read_item_i.metadata.sources = [url] if url else []
+            read_item_i.metadata.visibility = "public"

+            memory_items.append(read_item_i)
         return memory_items
-
-    def _chunk(self, content: str) -> list[tuple[str, list[float], float]]:
-        """
-        Use SentenceChunker to split content into chunks and embed each.
-
-        Returns:
-            List of (chunk_text, chunk_embedding, dummy_score)
-        """
-        chunks = self.chunker.chunk(content)
-        if not chunks:
-            return []
-
-        chunk_texts = [c.text for c in chunks]
-        chunk_embeddings = self.embedder.embed(chunk_texts)
-
-        return [(text, emb, 1.0) for text, emb in zip(chunk_texts, chunk_embeddings, strict=False)]
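
For readers following along, here is a minimal usage sketch of the retriever after this change. Only the `__init__` and `retrieve_from_internet` signatures come from the diff; the class name `XinyuSearchRetriever`, the credential placeholders, and the keys inside `info` are assumptions for illustration, not code from this PR.

```python
# Hedged usage sketch, assuming it sits in the same module as the retriever
# class edited above (so the class needs no import here).
from memos.embedders.factory import OllamaEmbedder
from memos.mem_reader.base import BaseMemReader
from memos.memories.textual.item import TextualMemoryItem


def retrieve_news(embedder: OllamaEmbedder, reader: BaseMemReader) -> list[TextualMemoryItem]:
    retriever = XinyuSearchRetriever(      # assumed class name for this file
        access_key="<xinyu-access-key>",   # placeholder credential
        search_engine_id="<engine-id>",    # placeholder engine id
        embedder=embedder,
        reader=reader,                     # replaces the old `chunker` argument
        max_results=20,
    )
    # `info` is threaded through to reader.get_memory([content], type="doc",
    # info=info), which now performs the chunking, embedding, and metadata
    # extraction that the removed _chunk() helper used to do. get_memory
    # returns one list of TextualMemoryItem per input document, which is why
    # _process_result iterates over read_items[0].
    return retriever.retrieve_from_internet(
        query="LLM long-term memory",
        top_k=10,
        info={"user_id": "u-123", "session_id": "s-456"},  # assumed keys
    )
```

The design point of the change: per-result chunking and per-chunk metadata construction move out of the retriever and into the shared MemReader, so internet results pass through the same reading pipeline (and metadata schema) as other document sources; the retriever then only overrides the web-specific fields (`source`, `memory_type`, `sources`, `visibility`).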