     TextualMemoryItem,
     TreeNodeTextualMemoryMetadata,
 )
-from memos.parsers.factory import ParserFactory
 from memos.types.openai_chat_completion_types import File

 from .base import BaseMessageParser, _derive_key
+from .utils import file_parser, text_splitter


 logger = get_logger(__name__)
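
For context, the new `from .utils import file_parser, text_splitter` line assumes a sibling `utils` module that exposes both objects as module-level singletons. That module is not part of this diff, so the following is only a plausible sketch: the markitdown backend is taken from the parser config removed further down, while the langchain splitter class and its chunk sizes are assumptions.

# --- Sketch (not part of the diff): one plausible shape for .utils ---
from langchain_text_splitters import RecursiveCharacterTextSplitter

from memos.configs.parser import ParserConfigFactory
from memos.parsers.factory import ParserFactory

try:
    # Module-level parser singleton shared by the message parsers.
    file_parser = ParserFactory.from_config(
        ParserConfigFactory.model_validate({"backend": "markitdown", "config": {}})
    )
except Exception:
    file_parser = None

try:
    # Module-level splitter singleton used by _split_text(); sizes are illustrative.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
except Exception:
    text_splitter = None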
@@ -108,6 +108,32 @@ def __init__(
         else:
             self.direct_markdown_hostnames = []

+    def _split_text(self, text: str) -> list[str]:
+        """
+        Split text into chunks using the langchain text splitter from utils.
+
+        Args:
+            text: Text to split
+
+        Returns:
+            List of text chunks
+        """
+        if not text or not text.strip():
+            return []
+
+        if not text_splitter:
+            # If the text splitter is not available, return the text as a single chunk
+            return [text] if text.strip() else []
+
+        try:
+            chunks = text_splitter.split_text(text)
+            logger.debug(f"[FileContentParser] Split text into {len(chunks)} chunks")
+            return chunks
+        except Exception as e:
+            logger.error(f"[FileContentParser] Error splitting text: {e}")
+            # Fall back to a single chunk
+            return [text] if text.strip() else []
+
     def create_source(
         self,
         message: File,
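
A minimal standalone sketch of what the splitter call in `_split_text` is expected to produce, assuming `text_splitter` is a langchain `RecursiveCharacterTextSplitter` as in the utils sketch above (the sample text and size values are illustrative):

# --- Sketch (not part of the diff): what text_splitter.split_text() yields ---
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

sample = "\n\n".join(f"Paragraph {i}: " + "lorem ipsum " * 40 for i in range(8))
chunks = splitter.split_text(sample)

# Several chunks, each at most ~1000 characters. _split_text() returns this list
# directly, or falls back to [sample] if the splitter is missing or raises.
print(len(chunks), [len(c) for c in chunks])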
@@ -152,21 +178,9 @@ def _parse_file(self, file_info: dict[str, Any]) -> str:
         Returns:
             Parsed text content
         """
-        if not self.parser:
-            # Try to create a default parser
-            try:
-                from memos.configs.parser import ParserConfigFactory
-
-                parser_config = ParserConfigFactory.model_validate(
-                    {
-                        "backend": "markitdown",
-                        "config": {},
-                    }
-                )
-                self.parser = ParserFactory.from_config(parser_config)
-            except Exception as e:
-                logger.warning(f"[FileContentParser] Failed to create parser: {e}")
-                return ""
+        if not file_parser:
+            logger.warning("[FileContentParser] Parser not available")
+            return ""

         file_path = file_info.get("path") or file_info.get("file_id", "")
         filename = file_info.get("filename", "unknown")
@@ -177,7 +191,7 @@ def _parse_file(self, file_info: dict[str, Any]) -> str:
 
         try:
             if os.path.exists(file_path):
-                parsed_text = self.parser.parse(file_path)
+                parsed_text = file_parser.parse(file_path)
                 return parsed_text
             else:
                 logger.warning(f"[FileContentParser] File not found: {file_path}")
@@ -264,6 +278,9 @@ def parse_fast(
         # Combine content parts
         content = " ".join(content_parts)

+        # Split content into chunks
+        content_chunks = self._split_text(content)
+
         # Create source
         source = self.create_source(message, info)
 
@@ -276,27 +293,59 @@ def parse_fast(
         # (since we don't have role information at this level)
         memory_type = "LongTermMemory"

-        # Create memory item
-        memory_item = TextualMemoryItem(
-            memory=content,
-            metadata=TreeNodeTextualMemoryMetadata(
-                user_id=user_id,
-                session_id=session_id,
-                memory_type=memory_type,
-                status="activated",
-                tags=["mode:fast", "multimodal:file"],
-                key=_derive_key(content),
-                embedding=self.embedder.embed([content])[0],
-                usage=[],
-                sources=[source],
-                background="",
-                confidence=0.99,
-                type="fact",
-                info=info_,
-            ),
-        )
+        # Create memory items for each chunk
+        memory_items = []
+        for chunk_idx, chunk_text in enumerate(content_chunks):
+            if not chunk_text.strip():
+                continue
+
+            memory_item = TextualMemoryItem(
+                memory=chunk_text,
+                metadata=TreeNodeTextualMemoryMetadata(
+                    user_id=user_id,
+                    session_id=session_id,
+                    memory_type=memory_type,
+                    status="activated",
+                    tags=[
+                        "mode:fast",
+                        "multimodal:file",
+                        f"chunk:{chunk_idx + 1}/{len(content_chunks)}",
+                    ],
+                    key=_derive_key(chunk_text),
+                    embedding=self.embedder.embed([chunk_text])[0],
+                    usage=[],
+                    sources=[source],
+                    background="",
+                    confidence=0.99,
+                    type="fact",
+                    info=info_,
+                ),
+            )
+            memory_items.append(memory_item)
+
+        # If no chunks were created, create a placeholder
+        if not memory_items:
+            memory_item = TextualMemoryItem(
+                memory=content,
+                metadata=TreeNodeTextualMemoryMetadata(
+                    user_id=user_id,
+                    session_id=session_id,
+                    memory_type=memory_type,
+                    status="activated",
+                    tags=["mode:fast", "multimodal:file"],
+                    key=_derive_key(content),
+                    embedding=self.embedder.embed([content])[0],
+                    usage=[],
+                    sources=[source],
+                    background="",
+                    confidence=0.99,
+                    type="fact",
+                    info=info_,
+                ),
+            )
+            memory_items.append(memory_item)

-        return [memory_item]
+        return memory_items

     def parse_fine(
         self,
@@ -326,22 +375,9 @@ def parse_fine(
         file_data = file_info.get("file_data", "")
         file_id = file_info.get("file_id", "")
         filename = file_info.get("filename", "")
-
-        # Initialize parser if not already set
-        if not self.parser:
-            try:
-                from memos.configs.parser import ParserConfigFactory
-
-                parser_config = ParserConfigFactory.model_validate(
-                    {
-                        "backend": "markitdown",
-                        "config": {},
-                    }
-                )
-                self.parser = ParserFactory.from_config(parser_config)
-            except Exception as e:
-                logger.warning(f"[FileContentParser] Failed to create parser: {e}")
-                return []
+        if not file_parser:
+            logger.warning("[FileContentParser] Parser not available")
+            return []

         parsed_text = ""
         temp_file_path = None
@@ -356,7 +392,12 @@ def parse_fine(
             parsed_text, temp_file_path = self._handle_url(url_str, filename)
             if temp_file_path:
                 try:
-                    parsed_text = self.parser.parse(temp_file_path)
+                    # Use parser from utils (singleton)
+                    parser = self.parser or file_parser
+                    if parser:
+                        parsed_text = parser.parse(temp_file_path)
+                    else:
+                        parsed_text = "[File parsing error: Parser not available]"
                 except Exception as e:
                     logger.error(
                         f"[FileContentParser] Error parsing downloaded file: {e}"
@@ -411,24 +452,59 @@ def parse_fine(
         # For file content parts, default to LongTermMemory
         memory_type = "LongTermMemory"

-        # Create memory item with parsed content
-        memory_item = TextualMemoryItem(
-            memory=parsed_text,
-            metadata=TreeNodeTextualMemoryMetadata(
-                user_id=user_id,
-                session_id=session_id,
-                memory_type=memory_type,
-                status="activated",
-                tags=["mode:fine", "multimodal:file"],
-                key=_derive_key(parsed_text),
-                embedding=self.embedder.embed([parsed_text])[0],
-                usage=[],
-                sources=[source],
-                background="",
-                confidence=0.99,
-                type="fact",
-                info=info_,
-            ),
-        )
+        # Split parsed text into chunks
+        content_chunks = self._split_text(parsed_text)
+
+        # Create memory items for each chunk
+        memory_items = []
+        for chunk_idx, chunk_text in enumerate(content_chunks):
+            if not chunk_text.strip():
+                continue
+
+            memory_item = TextualMemoryItem(
+                memory=chunk_text,
+                metadata=TreeNodeTextualMemoryMetadata(
+                    user_id=user_id,
+                    session_id=session_id,
+                    memory_type=memory_type,
+                    status="activated",
+                    tags=[
+                        "mode:fine",
+                        "multimodal:file",
+                        f"chunk:{chunk_idx + 1}/{len(content_chunks)}",
+                    ],
+                    key=_derive_key(chunk_text),
+                    embedding=self.embedder.embed([chunk_text])[0],
+                    usage=[],
+                    sources=[source],
+                    background="",
+                    confidence=0.99,
+                    type="fact",
+                    info=info_,
+                ),
+            )
+            memory_items.append(memory_item)
+
+        # If no chunks were created, create a placeholder
+        if not memory_items:
+            memory_item = TextualMemoryItem(
+                memory=parsed_text,
+                metadata=TreeNodeTextualMemoryMetadata(
+                    user_id=user_id,
+                    session_id=session_id,
+                    memory_type=memory_type,
+                    status="activated",
+                    tags=["mode:fine", "multimodal:file"],
+                    key=_derive_key(parsed_text),
+                    embedding=self.embedder.embed([parsed_text])[0],
+                    usage=[],
+                    sources=[source],
+                    background="",
+                    confidence=0.99,
+                    type="fact",
+                    info=info_,
+                ),
+            )
+            memory_items.append(memory_item)

-        return [memory_item]
+        return memory_items
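
With these changes, both parse_fast and parse_fine return one TextualMemoryItem per chunk, tagged "chunk:<i>/<N>" (1-based), or a single untagged placeholder item when no chunks are produced. A downstream consumer could use that tag to restore document order; the helper below is hypothetical, not code from this PR, and assumes attribute access on the returned items as used elsewhere in the diff (item.memory, item.metadata.tags).

# --- Sketch (not part of the diff): hypothetical consumer of the chunk tags ---
import re


def reassemble_chunks(memory_items) -> str:
    """Reorder chunked items by their 'chunk:i/N' tag and join their text."""

    def chunk_index(item) -> int:
        for tag in item.metadata.tags:
            match = re.fullmatch(r"chunk:(\d+)/\d+", tag)
            if match:
                return int(match.group(1))
        return 0  # placeholder items carry no chunk tag

    ordered = sorted(memory_items, key=chunk_index)
    return "\n".join(item.memory for item in ordered)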