Skip to content

Commit 98fa2b5

Browse files
fridayL and CaralHsi authored
Feat: reorganize chunk code and use markdown chunker (#618)
* feat: update memos headers * feat: headers add * feat: update search agent * feat: update mem story * feat: update mem scheduler * feat: update deepsearch mem code * feat: update deepsearch agent * feat: update test code * fix: remove dup config * feat: dock search pipeline * fix: code test * feat: add test scripts * feat: add test * feat: update need_raw process * fix: add initter * fix: change agent search func name * feat: update logs and defined * feat: update full text mem search * feat: cp plugin to dev * feat: add one recall for fulltext retrieval * fix: set default for fulltext search * feat: add langchain chunk * feat: fix playground for query * feat: update file content memory extract * feat: update code * feat: update import * code: reformat suffix * feat: update file_id * remove langchain-text-splitters==1.0.0 * feat: add requirement * feat: make test * feat: fix markdown * feat: fix simple chunker --------- Co-authored-by: CaralHsi <[email protected]>
1 parent 111b4d4 commit 98fa2b5

File tree

6 files changed

+131
-108
lines changed

6 files changed

+131
-108
lines changed
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from memos.configs.chunker import MarkdownChunkerConfig
2+
from memos.dependency import require_python_package
3+
from memos.log import get_logger
4+
5+
from .base import BaseChunker, Chunk
6+
7+
8+
logger = get_logger(__name__)
9+
10+
11+
class CharacterTextChunker(BaseChunker):
    """Plain-text chunker backed by langchain's ``RecursiveCharacterTextSplitter``.

    The splitter is configured with a fixed, ordered separator list that
    starts at paragraph breaks and ends with the empty string.
    """

    @require_python_package(
        import_name="langchain_text_splitters",
        install_command="pip install langchain_text_splitters==1.0.0",
        install_link="https://github.com/langchain-ai/langchain-text-splitters",
    )
    def __init__(
        self,
        config: MarkdownChunkerConfig | None = None,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        """Build the underlying splitter.

        Args:
            config: Optional config object. When truthy, its ``chunk_size``
                and ``chunk_overlap`` are used instead of the keyword
                arguments below.
            chunk_size: Chunk size used when ``config`` is not given.
            chunk_overlap: Chunk overlap used when ``config`` is not given.
        """
        # Imported here rather than at module level, as in the original code.
        from langchain_text_splitters import RecursiveCharacterTextSplitter

        self.config = config

        # A supplied config takes precedence over the explicit keyword values.
        if config:
            size = config.chunk_size
            overlap = config.chunk_overlap
        else:
            size = chunk_size
            overlap = chunk_overlap

        self.chunker = RecursiveCharacterTextSplitter(
            chunk_size=size,
            chunk_overlap=overlap,
            length_function=len,
            separators=["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " ", ""],
        )

    def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]:
        """Split ``text`` with the configured splitter and return the pieces.

        Args:
            text: The text to split.
            **kwargs: Accepted but not used by this implementation.

        Returns:
            Whatever ``self.chunker.split_text(text)`` produces.
        """
        pieces = self.chunker.split_text(text)
        logger.debug(f"Generated {len(pieces)} chunks from input text")
        return pieces

src/memos/chunkers/markdown_chunker.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,26 +16,35 @@ class MarkdownChunker(BaseChunker):
1616
install_command="pip install langchain_text_splitters==1.0.0",
1717
install_link="https://github.com/langchain-ai/langchain-text-splitters",
1818
)
19-
def __init__(self, config: MarkdownChunkerConfig):
19+
def __init__(
20+
self,
21+
config: MarkdownChunkerConfig | None = None,
22+
chunk_size: int = 1000,
23+
chunk_overlap: int = 200,
24+
recursive: bool = False,
25+
):
2026
from langchain_text_splitters import (
2127
MarkdownHeaderTextSplitter,
2228
RecursiveCharacterTextSplitter,
2329
)
2430

2531
self.config = config
2632
self.chunker = MarkdownHeaderTextSplitter(
27-
headers_to_split_on=config.headers_to_split_on,
28-
strip_headers=config.strip_headers,
33+
headers_to_split_on=config.headers_to_split_on
34+
if config
35+
else [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")],
36+
strip_headers=config.strip_headers if config else False,
2937
)
3038
self.chunker_recursive = None
3139
logger.info(f"Initialized MarkdownHeaderTextSplitter with config: {config}")
32-
if config.recursive:
40+
if (config and config.recursive) or recursive:
3341
self.chunker_recursive = RecursiveCharacterTextSplitter(
34-
chunk_size=config.chunk_size,
35-
chunk_overlap=config.chunk_overlap,
42+
chunk_size=config.chunk_size if config else chunk_size,
43+
chunk_overlap=config.chunk_overlap if config else chunk_overlap,
44+
length_function=len,
3645
)
3746

38-
def chunk(self, text: str) -> list[str] | list[Chunk]:
47+
def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]:
3948
"""Chunk the given text into smaller chunks based on sentences."""
4049
md_header_splits = self.chunker.split_text(text)
4150
chunks = []
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
class SimpleTextSplitter:
    """Dependency-free text splitter used as a fallback when langchain is unavailable.

    Text is cut into windows of at most ``chunk_size`` characters. Each cut is
    moved back to the last occurrence of the highest-priority separator found
    inside the window, and consecutive chunks share up to ``chunk_overlap``
    characters.
    """

    # Break points in priority order: paragraph, line, sentence end, space.
    _SEPARATORS = ("\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " ")

    def __init__(self, chunk_size: int, chunk_overlap: int):
        """Store the splitting parameters.

        Args:
            chunk_size: Maximum size of chunks, in characters.
            chunk_overlap: Number of characters shared between neighbouring chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk(self, text: str, **kwargs) -> list[str]:
        """Split ``text`` using the instance's size and overlap.

        Args:
            text: Text to split.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            List of text chunks.
        """
        return self._simple_split_text(text, self.chunk_size, self.chunk_overlap)

    def _simple_split_text(self, text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
        """
        Simple text splitter as fallback when langchain is not available.

        Args:
            text: Text to split
            chunk_size: Maximum size of chunks
            chunk_overlap: Overlap between chunks

        Returns:
            List of text chunks. Text that already fits in one chunk is
            returned as-is (unstripped) in a single-element list; empty or
            whitespace-only text yields an empty list. Chunks produced by
            actual splitting are stripped, and empty ones are dropped.
        """
        if not text or len(text) <= chunk_size:
            return [text] if text.strip() else []

        chunks: list[str] = []
        start = 0
        text_len = len(text)

        while start < text_len:
            # Hard upper bound of the current window.
            end = min(start + chunk_size, text_len)

            # If not the last chunk, try to break at a good position.
            if end < text_len:
                for separator in self._SEPARATORS:
                    last_sep = text.rfind(separator, start, end)
                    if last_sep != -1:
                        # Keep the separator with the chunk it terminates.
                        end = last_sep + len(separator)
                        break

            piece = text[start:end].strip()
            if piece:
                chunks.append(piece)

            # The chunk that reaches the end of the text is the final one.
            # Continuing would only re-emit ever-shorter suffixes of it.
            if end >= text_len:
                break

            next_start = end - chunk_overlap
            if next_start > start:
                start = next_start
            else:
                # The chunk is not longer than the overlap, so stepping back
                # would stall on the same break point and emit one-character
                # fragments. Continue right after the chunk instead;
                # ``start + 1`` guarantees progress even for a degenerate
                # chunk_size.
                start = max(end, start + 1)

        return chunks

src/memos/mem_reader/read_multi_modal/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ def parse(
226226
else:
227227
raise ValueError(f"Unknown mode: {mode}. Must be 'fast' or 'fine'")
228228

229-
def _split_text(self, text: str) -> list[str]:
229+
def _split_text(self, text: str, is_markdown: bool = False) -> list[str]:
230230
"""
231231
Split text into chunks using text splitter from utils.
232232
@@ -245,7 +245,7 @@ def _split_text(self, text: str) -> list[str]:
245245
return [text] if text.strip() else []
246246

247247
try:
248-
chunks = splitter.split_text(text)
248+
chunks = splitter.chunk(text)
249249
logger.debug(f"[FileContentParser] Split text into {len(chunks)} chunks")
250250
return chunks
251251
except Exception as e:

src/memos/mem_reader/read_multi_modal/file_content_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,7 @@ def parse_fine(
506506
memory_type = "LongTermMemory"
507507

508508
# Split parsed text into chunks
509-
content_chunks = self._split_text(parsed_text)
509+
content_chunks = self._split_text(parsed_text, is_markdown)
510510

511511
# Filter out empty chunks and create indexed list
512512
valid_chunks = [

src/memos/mem_reader/read_multi_modal/utils.py

Lines changed: 21 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -111,48 +111,6 @@ def _cheap_close(t: str) -> str:
111111
DEFAULT_CHUNK_OVERLAP = int(os.getenv("FILE_PARSER_CHUNK_OVERLAP", "200"))
112112

113113

114-
def _simple_split_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
115-
"""
116-
Simple text splitter as fallback when langchain is not available.
117-
118-
Args:
119-
text: Text to split
120-
chunk_size: Maximum size of chunks
121-
chunk_overlap: Overlap between chunks
122-
123-
Returns:
124-
List of text chunks
125-
"""
126-
if not text or len(text) <= chunk_size:
127-
return [text] if text.strip() else []
128-
129-
chunks = []
130-
start = 0
131-
text_len = len(text)
132-
133-
while start < text_len:
134-
# Calculate end position
135-
end = min(start + chunk_size, text_len)
136-
137-
# If not the last chunk, try to break at a good position
138-
if end < text_len:
139-
# Try to break at newline, sentence end, or space
140-
for separator in ["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " "]:
141-
last_sep = text.rfind(separator, start, end)
142-
if last_sep != -1:
143-
end = last_sep + len(separator)
144-
break
145-
146-
chunk = text[start:end].strip()
147-
if chunk:
148-
chunks.append(chunk)
149-
150-
# Move start position with overlap
151-
start = max(start + 1, end - chunk_overlap)
152-
153-
return chunks
154-
155-
156114
# Initialize parser instance
157115
file_parser = None
158116
try:
@@ -163,51 +121,27 @@ def _simple_split_text(text: str, chunk_size: int, chunk_overlap: int) -> list[s
163121
logger.error(f"[FileContentParser] Failed to create parser: {e}")
164122
file_parser = None
165123

166-
# Initialize text splitter instance
167-
text_splitter = None
168-
_use_simple_splitter = False
124+
markdown_text_splitter = None
169125

170126
try:
171-
try:
172-
from langchain.text_splitter import RecursiveCharacterTextSplitter
173-
except ImportError:
174-
try:
175-
from langchain_text_splitters import (
176-
MarkdownHeaderTextSplitter,
177-
RecursiveCharacterTextSplitter,
178-
)
179-
except ImportError:
180-
logger.error(
181-
"langchain not available. Install with: pip install langchain or pip install langchain-text-splitters"
182-
)
183-
184-
text_splitter = RecursiveCharacterTextSplitter(
185-
chunk_size=DEFAULT_CHUNK_SIZE,
186-
chunk_overlap=DEFAULT_CHUNK_OVERLAP,
187-
length_function=len,
188-
separators=["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " ", ""],
189-
)
190-
markdown_text_splitter = MarkdownHeaderTextSplitter(
191-
headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")],
192-
strip_headers=False,
193-
)
194-
logger.debug(
195-
f"[FileContentParser] Initialized langchain text splitter with chunk_size={DEFAULT_CHUNK_SIZE}, "
196-
f"chunk_overlap={DEFAULT_CHUNK_OVERLAP}"
127+
from memos.chunkers.charactertext_chunker import CharacterTextChunker
128+
from memos.chunkers.markdown_chunker import MarkdownChunker
129+
130+
markdown_text_splitter = MarkdownChunker(
131+
chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP, recursive=True
197132
)
198-
except ImportError as e:
199-
logger.warning(
200-
f"[FileContentParser] langchain not available, using simple text splitter as fallback: {e}. "
201-
"Install with: pip install langchain or pip install langchain-text-splitters"
133+
text_splitter = CharacterTextChunker(
134+
chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
202135
)
203-
text_splitter = None
204-
_use_simple_splitter = True
136+
logger.info("[FileContentParser] Initialized text splitter instances by lancga")
205137
except Exception as e:
206-
logger.error(
207-
f"[FileContentParser] Failed to initialize text splitter: {e}, using simple splitter as fallback"
138+
logger.warning(
139+
f"[FileContentParser] Failed to create text splitter: {e} will use simple splitter fallback"
208140
)
141+
from memos.chunkers.simple_chunker import SimpleTextSplitter
142+
143+
markdown_text_splitter = None
209144
text_splitter = None
210-
_use_simple_splitter = True
211145

212146

213147
def get_parser() -> Any:
@@ -220,7 +154,9 @@ def get_parser() -> Any:
220154
return file_parser
221155

222156

223-
def get_text_splitter(chunk_size: int | None = None, chunk_overlap: int | None = None) -> Any:
157+
def get_text_splitter(
158+
chunk_size: int | None = None, chunk_overlap: int | None = None, is_markdown: bool = False
159+
) -> Any:
224160
"""
225161
Get text splitter instance or a callable that uses simple splitter.
226162
@@ -231,28 +167,15 @@ def get_text_splitter(chunk_size: int | None = None, chunk_overlap: int | None =
231167
Returns:
232168
Text splitter instance (RecursiveCharacterTextSplitter) or a callable wrapper for simple splitter
233169
"""
234-
if text_splitter is not None:
170+
if is_markdown and markdown_text_splitter is not None:
171+
return markdown_text_splitter
172+
elif text_splitter is not None:
235173
return text_splitter
236-
237-
# Return a callable wrapper that uses simple splitter
238-
if _use_simple_splitter:
174+
else:
239175
actual_chunk_size = chunk_size or DEFAULT_CHUNK_SIZE
240176
actual_chunk_overlap = chunk_overlap or DEFAULT_CHUNK_OVERLAP
241-
242-
class SimpleTextSplitter:
243-
"""Simple text splitter wrapper."""
244-
245-
def __init__(self, chunk_size: int, chunk_overlap: int):
246-
self.chunk_size = chunk_size
247-
self.chunk_overlap = chunk_overlap
248-
249-
def split_text(self, text: str) -> list[str]:
250-
return _simple_split_text(text, self.chunk_size, self.chunk_overlap)
251-
252177
return SimpleTextSplitter(actual_chunk_size, actual_chunk_overlap)
253178

254-
return None
255-
256179

257180
def extract_role(message: dict[str, Any]) -> str:
258181
"""Extract role from message."""

0 commit comments

Comments
 (0)