Skip to content

Commit 4cef2c3

Browse files
committed
feat: add default splitter
1 parent c955fd0 commit 4cef2c3

File tree

2 files changed

+109
-12
lines changed

2 files changed

+109
-12
lines changed

src/memos/mem_reader/read_multi_modal/file_content_parser.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from memos.types.openai_chat_completion_types import File
1717

1818
from .base import BaseMessageParser, _derive_key
19-
from .utils import file_parser, text_splitter
19+
from .utils import get_parser, get_text_splitter
2020

2121

2222
logger = get_logger(__name__)
@@ -110,7 +110,7 @@ def __init__(
110110

111111
def _split_text(self, text: str) -> list[str]:
112112
"""
113-
Split text into chunks using langchain text splitter from utils.
113+
Split text into chunks using text splitter from utils.
114114
115115
Args:
116116
text: Text to split
@@ -121,12 +121,13 @@ def _split_text(self, text: str) -> list[str]:
121121
if not text or not text.strip():
122122
return []
123123

124-
if not text_splitter:
124+
splitter = get_text_splitter()
125+
if not splitter:
125126
# If text splitter is not available, return text as single chunk
126127
return [text] if text.strip() else []
127128

128129
try:
129-
chunks = text_splitter.split_text(text)
130+
chunks = splitter.split_text(text)
130131
logger.debug(f"[FileContentParser] Split text into {len(chunks)} chunks")
131132
return chunks
132133
except Exception as e:
@@ -178,7 +179,8 @@ def _parse_file(self, file_info: dict[str, Any]) -> str:
178179
Returns:
179180
Parsed text content
180181
"""
181-
if not file_parser:
182+
parser = self.parser or get_parser()
183+
if not parser:
182184
logger.warning("[FileContentParser] Parser not available")
183185
return ""
184186

@@ -191,7 +193,7 @@ def _parse_file(self, file_info: dict[str, Any]) -> str:
191193

192194
try:
193195
if os.path.exists(file_path):
194-
parsed_text = file_parser.parse(file_path)
196+
parsed_text = parser.parse(file_path)
195197
return parsed_text
196198
else:
197199
logger.warning(f"[FileContentParser] File not found: {file_path}")
@@ -375,7 +377,10 @@ def parse_fine(
375377
file_data = file_info.get("file_data", "")
376378
file_id = file_info.get("file_id", "")
377379
filename = file_info.get("filename", "")
378-
if not file_parser:
380+
381+
# Use parser from utils
382+
parser = self.parser or get_parser()
383+
if not parser:
379384
logger.warning("[FileContentParser] Parser not available")
380385
return []
381386

@@ -392,8 +397,7 @@ def parse_fine(
392397
parsed_text, temp_file_path = self._handle_url(url_str, filename)
393398
if temp_file_path:
394399
try:
395-
# Use parser from utils (singleton)
396-
parser = self.parser or file_parser
400+
# Use parser from utils
397401
if parser:
398402
parsed_text = parser.parse(temp_file_path)
399403
else:

src/memos/mem_reader/read_multi_modal/utils.py

Lines changed: 96 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,49 @@
5252
DEFAULT_CHUNK_SIZE = int(os.getenv("FILE_PARSER_CHUNK_SIZE", "1000"))
5353
DEFAULT_CHUNK_OVERLAP = int(os.getenv("FILE_PARSER_CHUNK_OVERLAP", "200"))
5454

55+
56+
def _simple_split_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
57+
"""
58+
Simple text splitter as fallback when langchain is not available.
59+
60+
Args:
61+
text: Text to split
62+
chunk_size: Maximum size of chunks
63+
chunk_overlap: Overlap between chunks
64+
65+
Returns:
66+
List of text chunks
67+
"""
68+
if not text or len(text) <= chunk_size:
69+
return [text] if text.strip() else []
70+
71+
chunks = []
72+
start = 0
73+
text_len = len(text)
74+
75+
while start < text_len:
76+
# Calculate end position
77+
end = min(start + chunk_size, text_len)
78+
79+
# If not the last chunk, try to break at a good position
80+
if end < text_len:
81+
# Try to break at newline, sentence end, or space
82+
for separator in ["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " "]:
83+
last_sep = text.rfind(separator, start, end)
84+
if last_sep != -1:
85+
end = last_sep + len(separator)
86+
break
87+
88+
chunk = text[start:end].strip()
89+
if chunk:
90+
chunks.append(chunk)
91+
92+
# Move start position with overlap
93+
start = max(start + 1, end - chunk_overlap)
94+
95+
return chunks
96+
97+
5598
# Initialize parser instance
5699
file_parser = None
57100
try:
@@ -64,6 +107,8 @@
64107

65108
# Initialize text splitter instance
66109
text_splitter = None
110+
_use_simple_splitter = False
111+
67112
try:
68113
try:
69114
from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -82,18 +127,66 @@
82127
separators=["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " ", ""],
83128
)
84129
logger.debug(
85-
f"[FileContentParser] Initialized text splitter with chunk_size={DEFAULT_CHUNK_SIZE}, "
130+
f"[FileContentParser] Initialized langchain text splitter with chunk_size={DEFAULT_CHUNK_SIZE}, "
86131
f"chunk_overlap={DEFAULT_CHUNK_OVERLAP}"
87132
)
88133
except ImportError as e:
89134
logger.warning(
90-
f"[FileContentParser] langchain not available, text splitting will be disabled: {e}. "
135+
f"[FileContentParser] langchain not available, using simple text splitter as fallback: {e}. "
91136
"Install with: pip install langchain or pip install langchain-text-splitters"
92137
)
93138
text_splitter = None
139+
_use_simple_splitter = True
94140
except Exception as e:
95-
logger.error(f"[FileContentParser] Failed to initialize text splitter: {e}")
141+
logger.error(
142+
f"[FileContentParser] Failed to initialize text splitter: {e}, using simple splitter as fallback"
143+
)
96144
text_splitter = None
145+
_use_simple_splitter = True
146+
147+
148+
def get_parser() -> Any:
    """
    Return the shared file parser held by this module.

    Returns:
        The module-level ``file_parser`` object, or None if it is not available
    """
    parser = file_parser
    return parser
156+
157+
158+
def get_text_splitter(chunk_size: int | None = None, chunk_overlap: int | None = None) -> Any:
    """
    Get the text splitter instance, or a wrapper around the simple splitter.

    When the module-level ``text_splitter`` is set it is returned as-is and
    the arguments are ignored. Otherwise, if the simple-splitter fallback is
    enabled, a small object exposing ``split_text(text)`` is returned.

    Args:
        chunk_size: Maximum size of chunks when splitting text (used for the
            simple splitter fallback). None or 0 selects DEFAULT_CHUNK_SIZE.
        chunk_overlap: Overlap between chunks when splitting text (used for
            the simple splitter fallback). None selects DEFAULT_CHUNK_OVERLAP;
            an explicit 0 is honored and means "no overlap".

    Returns:
        The module-level text splitter, a SimpleTextSplitter wrapper, or None
        if neither is available
    """
    if text_splitter is not None:
        return text_splitter

    if not _use_simple_splitter:
        return None

    # A chunk size of 0 cannot produce chunks, so it falls back to the default.
    actual_chunk_size = chunk_size or DEFAULT_CHUNK_SIZE
    # Compare against None explicitly: 0 is a valid overlap and must not be
    # replaced by the default.
    actual_chunk_overlap = DEFAULT_CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    class SimpleTextSplitter:
        """Simple text splitter wrapper exposing a ``split_text`` method."""

        def __init__(self, chunk_size: int, chunk_overlap: int):
            self.chunk_size = chunk_size
            self.chunk_overlap = chunk_overlap

        def split_text(self, text: str) -> list[str]:
            return _simple_split_text(text, self.chunk_size, self.chunk_overlap)

    return SimpleTextSplitter(actual_chunk_size, actual_chunk_overlap)
97190

98191

99192
def extract_role(message: dict[str, Any]) -> str:

0 commit comments

Comments
 (0)