Skip to content

Commit c955fd0

Browse files
committed
feat: add text splitter and parser
1 parent ff4dcdc commit c955fd0

File tree

2 files changed

+202
-74
lines changed

2 files changed

+202
-74
lines changed

src/memos/mem_reader/read_multi_modal/file_content_parser.py

Lines changed: 150 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@
1313
TextualMemoryItem,
1414
TreeNodeTextualMemoryMetadata,
1515
)
16-
from memos.parsers.factory import ParserFactory
1716
from memos.types.openai_chat_completion_types import File
1817

1918
from .base import BaseMessageParser, _derive_key
19+
from .utils import file_parser, text_splitter
2020

2121

2222
logger = get_logger(__name__)
@@ -108,6 +108,32 @@ def __init__(
108108
else:
109109
self.direct_markdown_hostnames = []
110110

111+
def _split_text(self, text: str) -> list[str]:
    """Split text into chunks using the module-level langchain text splitter.

    Args:
        text: Text to split.

    Returns:
        List of text chunks; an empty list for blank input, or the whole
        text as a single chunk when no splitter is available or splitting fails.
    """
    # Blank or whitespace-only input produces no chunks.
    if not text or not text.strip():
        return []

    # If the splitter failed to initialize (langchain missing), degrade
    # gracefully to a single chunk. The early return above already
    # guarantees text is non-blank, so no re-check of text.strip() is needed.
    if not text_splitter:
        return [text]

    try:
        chunks = text_splitter.split_text(text)
        logger.debug(f"[FileContentParser] Split text into {len(chunks)} chunks")
        return chunks
    except Exception as e:
        logger.error(f"[FileContentParser] Error splitting text: {e}")
        # Fallback to a single chunk on any splitter failure.
        return [text]
136+
111137
def create_source(
112138
self,
113139
message: File,
@@ -152,21 +178,9 @@ def _parse_file(self, file_info: dict[str, Any]) -> str:
152178
Returns:
153179
Parsed text content
154180
"""
155-
if not self.parser:
156-
# Try to create a default parser
157-
try:
158-
from memos.configs.parser import ParserConfigFactory
159-
160-
parser_config = ParserConfigFactory.model_validate(
161-
{
162-
"backend": "markitdown",
163-
"config": {},
164-
}
165-
)
166-
self.parser = ParserFactory.from_config(parser_config)
167-
except Exception as e:
168-
logger.warning(f"[FileContentParser] Failed to create parser: {e}")
169-
return ""
181+
if not file_parser:
182+
logger.warning("[FileContentParser] Parser not available")
183+
return ""
170184

171185
file_path = file_info.get("path") or file_info.get("file_id", "")
172186
filename = file_info.get("filename", "unknown")
@@ -177,7 +191,7 @@ def _parse_file(self, file_info: dict[str, Any]) -> str:
177191

178192
try:
179193
if os.path.exists(file_path):
180-
parsed_text = self.parser.parse(file_path)
194+
parsed_text = file_parser.parse(file_path)
181195
return parsed_text
182196
else:
183197
logger.warning(f"[FileContentParser] File not found: {file_path}")
@@ -264,6 +278,9 @@ def parse_fast(
264278
# Combine content parts
265279
content = " ".join(content_parts)
266280

281+
# Split content into chunks
282+
content_chunks = self._split_text(content)
283+
267284
# Create source
268285
source = self.create_source(message, info)
269286

@@ -276,27 +293,59 @@ def parse_fast(
276293
# (since we don't have role information at this level)
277294
memory_type = "LongTermMemory"
278295

279-
# Create memory item
280-
memory_item = TextualMemoryItem(
281-
memory=content,
282-
metadata=TreeNodeTextualMemoryMetadata(
283-
user_id=user_id,
284-
session_id=session_id,
285-
memory_type=memory_type,
286-
status="activated",
287-
tags=["mode:fast", "multimodal:file"],
288-
key=_derive_key(content),
289-
embedding=self.embedder.embed([content])[0],
290-
usage=[],
291-
sources=[source],
292-
background="",
293-
confidence=0.99,
294-
type="fact",
295-
info=info_,
296-
),
297-
)
296+
# Create memory items for each chunk
297+
memory_items = []
298+
for chunk_idx, chunk_text in enumerate(content_chunks):
299+
if not chunk_text.strip():
300+
continue
301+
302+
memory_item = TextualMemoryItem(
303+
memory=chunk_text,
304+
metadata=TreeNodeTextualMemoryMetadata(
305+
user_id=user_id,
306+
session_id=session_id,
307+
memory_type=memory_type,
308+
status="activated",
309+
tags=[
310+
"mode:fast",
311+
"multimodal:file",
312+
f"chunk:{chunk_idx + 1}/{len(content_chunks)}",
313+
],
314+
key=_derive_key(chunk_text),
315+
embedding=self.embedder.embed([chunk_text])[0],
316+
usage=[],
317+
sources=[source],
318+
background="",
319+
confidence=0.99,
320+
type="fact",
321+
info=info_,
322+
),
323+
)
324+
memory_items.append(memory_item)
325+
326+
# If no chunks were created, create a placeholder
327+
if not memory_items:
328+
memory_item = TextualMemoryItem(
329+
memory=content,
330+
metadata=TreeNodeTextualMemoryMetadata(
331+
user_id=user_id,
332+
session_id=session_id,
333+
memory_type=memory_type,
334+
status="activated",
335+
tags=["mode:fast", "multimodal:file"],
336+
key=_derive_key(content),
337+
embedding=self.embedder.embed([content])[0],
338+
usage=[],
339+
sources=[source],
340+
background="",
341+
confidence=0.99,
342+
type="fact",
343+
info=info_,
344+
),
345+
)
346+
memory_items.append(memory_item)
298347

299-
return [memory_item]
348+
return memory_items
300349

301350
def parse_fine(
302351
self,
@@ -326,22 +375,9 @@ def parse_fine(
326375
file_data = file_info.get("file_data", "")
327376
file_id = file_info.get("file_id", "")
328377
filename = file_info.get("filename", "")
329-
330-
# Initialize parser if not already set
331-
if not self.parser:
332-
try:
333-
from memos.configs.parser import ParserConfigFactory
334-
335-
parser_config = ParserConfigFactory.model_validate(
336-
{
337-
"backend": "markitdown",
338-
"config": {},
339-
}
340-
)
341-
self.parser = ParserFactory.from_config(parser_config)
342-
except Exception as e:
343-
logger.warning(f"[FileContentParser] Failed to create parser: {e}")
344-
return []
378+
if not file_parser:
379+
logger.warning("[FileContentParser] Parser not available")
380+
return []
345381

346382
parsed_text = ""
347383
temp_file_path = None
@@ -356,7 +392,12 @@ def parse_fine(
356392
parsed_text, temp_file_path = self._handle_url(url_str, filename)
357393
if temp_file_path:
358394
try:
359-
parsed_text = self.parser.parse(temp_file_path)
395+
# Use parser from utils (singleton)
396+
parser = self.parser or file_parser
397+
if parser:
398+
parsed_text = parser.parse(temp_file_path)
399+
else:
400+
parsed_text = "[File parsing error: Parser not available]"
360401
except Exception as e:
361402
logger.error(
362403
f"[FileContentParser] Error parsing downloaded file: {e}"
@@ -411,24 +452,59 @@ def parse_fine(
411452
# For file content parts, default to LongTermMemory
412453
memory_type = "LongTermMemory"
413454

414-
# Create memory item with parsed content
415-
memory_item = TextualMemoryItem(
416-
memory=parsed_text,
417-
metadata=TreeNodeTextualMemoryMetadata(
418-
user_id=user_id,
419-
session_id=session_id,
420-
memory_type=memory_type,
421-
status="activated",
422-
tags=["mode:fine", "multimodal:file"],
423-
key=_derive_key(parsed_text),
424-
embedding=self.embedder.embed([parsed_text])[0],
425-
usage=[],
426-
sources=[source],
427-
background="",
428-
confidence=0.99,
429-
type="fact",
430-
info=info_,
431-
),
432-
)
455+
# Split parsed text into chunks
456+
content_chunks = self._split_text(parsed_text)
457+
458+
# Create memory items for each chunk
459+
memory_items = []
460+
for chunk_idx, chunk_text in enumerate(content_chunks):
461+
if not chunk_text.strip():
462+
continue
463+
464+
memory_item = TextualMemoryItem(
465+
memory=chunk_text,
466+
metadata=TreeNodeTextualMemoryMetadata(
467+
user_id=user_id,
468+
session_id=session_id,
469+
memory_type=memory_type,
470+
status="activated",
471+
tags=[
472+
"mode:fine",
473+
"multimodal:file",
474+
f"chunk:{chunk_idx + 1}/{len(content_chunks)}",
475+
],
476+
key=_derive_key(chunk_text),
477+
embedding=self.embedder.embed([chunk_text])[0],
478+
usage=[],
479+
sources=[source],
480+
background="",
481+
confidence=0.99,
482+
type="fact",
483+
info=info_,
484+
),
485+
)
486+
memory_items.append(memory_item)
487+
488+
# If no chunks were created, create a placeholder
489+
if not memory_items:
490+
memory_item = TextualMemoryItem(
491+
memory=parsed_text,
492+
metadata=TreeNodeTextualMemoryMetadata(
493+
user_id=user_id,
494+
session_id=session_id,
495+
memory_type=memory_type,
496+
status="activated",
497+
tags=["mode:fine", "multimodal:file"],
498+
key=_derive_key(parsed_text),
499+
embedding=self.embedder.embed([parsed_text])[0],
500+
usage=[],
501+
sources=[source],
502+
background="",
503+
confidence=0.99,
504+
type="fact",
505+
info=info_,
506+
),
507+
)
508+
memory_items.append(memory_item)
433509

434-
return [memory_item]
510+
return memory_items

src/memos/mem_reader/read_multi_modal/utils.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,58 @@
4343
re.I,
4444
)
4545

46+
# Default configuration for parser and text splitter
DEFAULT_PARSER_CONFIG = {
    "backend": "markitdown",
    "config": {},
}

# Chunking parameters, overridable via environment variables.
DEFAULT_CHUNK_SIZE = int(os.getenv("FILE_PARSER_CHUNK_SIZE", "1000"))
DEFAULT_CHUNK_OVERLAP = int(os.getenv("FILE_PARSER_CHUNK_OVERLAP", "200"))

# Module-level singleton parser instance. Left as None when construction
# fails so callers can degrade gracefully instead of crashing at import time.
file_parser = None
try:
    parser_config = ParserConfigFactory.model_validate(DEFAULT_PARSER_CONFIG)
    file_parser = ParserFactory.from_config(parser_config)
    logger.debug("[FileContentParser] Initialized parser instance")
except Exception as e:
    logger.error(f"[FileContentParser] Failed to create parser: {e}")
    file_parser = None

# Module-level singleton text splitter. Left as None (chunking disabled)
# when langchain is unavailable or initialization fails.
text_splitter = None
try:
    try:
        from langchain.text_splitter import RecursiveCharacterTextSplitter
    except ImportError:
        # Newer langchain releases ship the splitter in a separate package.
        # If this import also fails, the ImportError propagates to the outer
        # handler below — previously the error was swallowed here, letting
        # control fall through to an undefined name and a misleading
        # NameError logged as "Failed to initialize text splitter".
        from langchain_text_splitters import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=DEFAULT_CHUNK_SIZE,
        chunk_overlap=DEFAULT_CHUNK_OVERLAP,
        length_function=len,
        separators=["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " ", ""],
    )
    logger.debug(
        f"[FileContentParser] Initialized text splitter with chunk_size={DEFAULT_CHUNK_SIZE}, "
        f"chunk_overlap={DEFAULT_CHUNK_OVERLAP}"
    )
except ImportError as e:
    logger.warning(
        f"[FileContentParser] langchain not available, text splitting will be disabled: {e}. "
        "Install with: pip install langchain or pip install langchain-text-splitters"
    )
    text_splitter = None
except Exception as e:
    logger.error(f"[FileContentParser] Failed to initialize text splitter: {e}")
    text_splitter = None
97+
4698

4799
def extract_role(message: dict[str, Any]) -> str:
48100
"""Extract role from message."""

0 commit comments

Comments
 (0)