
Commit 69bad38

Feat: add langchain markdown chunker (#574)
* feat: update memos headers
* feat: add headers
* feat: update search agent
* feat: update mem story
* feat: update mem scheduler
* feat: update deepsearch mem code
* feat: update deepsearch agent
* feat: update test code
* fix: remove dup config
* feat: dock search pipeline
* fix: code test
* feat: add test scripts
* feat: add test
* feat: update need_raw process
* fix: add initter
* fix: change agent search func name
* feat: update logs and defined
* feat: update full text mem search
* feat: cp plugin to dev
* feat: add one recall for fulltext retrieval
* fix: set default for fulltext search
* feat: add langchain chunk
1 parent ac9af5f commit 69bad38

File tree

7 files changed: +164 −270 lines changed

Lines changed: 33 additions & 0 deletions

```python
from memos.chunkers import ChunkerFactory
from memos.configs.chunker import ChunkerConfigFactory


config = ChunkerConfigFactory.model_validate(
    {
        "backend": "markdown",
        "config": {
            "chunk_size": 1000,
            "chunk_overlap": 100,
            "recursive": True,
        },
    }
)

chunker = ChunkerFactory.from_config(config)

text = """
# Header 1
This is the first sentence. This is the second sentence.
And here's a third one with some additional context.

# Header 2
This is the fourth sentence. This is the fifth sentence.
And here's a sixth one with some additional context.

# Header 3
This is the seventh sentence. This is the eighth sentence.
And here's a ninth one with some additional context.
"""
chunks = chunker.chunk(text)
for chunk in chunks:
    print("doc:", chunk)
```

poetry.lock

Lines changed: 56 additions & 268 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 2 additions & 1 deletion
```diff
@@ -87,6 +87,7 @@ mem-user = [
 mem-reader = [
     "chonkie (>=1.0.7,<2.0.0)", # Sentence chunking library
     "markitdown[docx,pdf,pptx,xls,xlsx] (>=0.1.1,<0.2.0)", # Markdown parser for various file formats
+    "langchain-text-splitters (>=1.0.0,<2.0.0)", # markdown chunk for langchain
 ]

 # PreferenceTextMemory
@@ -105,6 +106,7 @@ all = [
     "pika (>=1.3.2,<2.0.0)",
     "pymysql (>=1.1.0,<2.0.0)",
     "chonkie (>=1.0.7,<2.0.0)",
+    "langchain-text-splitters (>=1.0.0,<2.0.0)",
     "markitdown[docx,pdf,pptx,xls,xlsx] (>=0.1.1,<0.2.0)",
     "pymilvus (>=2.6.1,<3.0.0)",
     "datasketch (>=1.6.5,<2.0.0)",
@@ -174,7 +176,6 @@ bert-score = "^0.3.13"
 scipy = "^1.10.1"
 python-dotenv = "^1.1.1"
 langgraph = "^0.5.1"
-langmem = "^0.0.27"


 [tool.poetry.group.mem-user.dependencies]
```

src/memos/chunkers/factory.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -3,6 +3,7 @@
 from memos.configs.chunker import ChunkerConfigFactory

 from .base import BaseChunker
+from .markdown_chunker import MarkdownChunker
 from .sentence_chunker import SentenceChunker


@@ -11,6 +12,7 @@ class ChunkerFactory:

     backend_to_class: ClassVar[dict[str, Any]] = {
         "sentence": SentenceChunker,
+        "markdown": MarkdownChunker,
     }

     @classmethod
```
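As a quick illustration (not part of the diff), the new mapping lets the existing factory entry point resolve the "markdown" backend; this sketch reuses the config call from the example script above, and the final assertion assumes `from_config` dispatches through `backend_to_class` as the sentence backend does:

```python
from memos.chunkers import ChunkerFactory
from memos.chunkers.markdown_chunker import MarkdownChunker
from memos.configs.chunker import ChunkerConfigFactory

# "markdown" should resolve to MarkdownChunker via ChunkerFactory.backend_to_class.
config = ChunkerConfigFactory.model_validate(
    {"backend": "markdown", "config": {"chunk_size": 1000, "chunk_overlap": 100}}
)
chunker = ChunkerFactory.from_config(config)
assert isinstance(chunker, MarkdownChunker)
```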
src/memos/chunkers/markdown_chunker.py

Lines changed: 53 additions & 0 deletions
```python
from memos.configs.chunker import MarkdownChunkerConfig
from memos.dependency import require_python_package
from memos.log import get_logger

from .base import BaseChunker, Chunk


logger = get_logger(__name__)


class MarkdownChunker(BaseChunker):
    """Markdown-based text chunker."""

    @require_python_package(
        import_name="langchain_text_splitters",
        install_command="pip install langchain_text_splitters==1.0.0",
        install_link="https://github.com/langchain-ai/langchain-text-splitters",
    )
    def __init__(self, config: MarkdownChunkerConfig):
        from langchain_text_splitters import (
            MarkdownHeaderTextSplitter,
            RecursiveCharacterTextSplitter,
        )

        self.config = config
        self.chunker = MarkdownHeaderTextSplitter(
            headers_to_split_on=config.headers_to_split_on,
            strip_headers=config.strip_headers,
        )
        self.chunker_recursive = None
        logger.info(f"Initialized MarkdownHeaderTextSplitter with config: {config}")
        if config.recursive:
            self.chunker_recursive = RecursiveCharacterTextSplitter(
                chunk_size=config.chunk_size,
                chunk_overlap=config.chunk_overlap,
            )

    def chunk(self, text: str) -> list[str] | list[Chunk]:
        """Chunk the given text into smaller chunks based on markdown headers."""
        md_header_splits = self.chunker.split_text(text)
        chunks = []
        if self.chunker_recursive:
            md_header_splits = self.chunker_recursive.split_documents(md_header_splits)
        for doc in md_header_splits:
            try:
                chunk = " ".join(list(doc.metadata.values())) + "\n" + doc.page_content
                chunks.append(chunk)
            except Exception as e:
                logger.warning(f"Error chunking document: {e}")
                chunks.append(doc.page_content)

        logger.debug(f"Generated {len(chunks)} chunks from input text")
        return chunks
```
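A minimal sketch (not from the commit) of using MarkdownChunker directly, assuming MarkdownChunkerConfig exposes the fields added in src/memos/configs/chunker.py below and accepts the chunk_size/chunk_overlap values seen in the example script:

```python
from memos.chunkers.markdown_chunker import MarkdownChunker
from memos.configs.chunker import MarkdownChunkerConfig

config = MarkdownChunkerConfig(
    chunk_size=1000,
    chunk_overlap=100,
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")],
    strip_headers=True,  # drop "#" lines; header text is re-attached from metadata
    recursive=False,     # header splits only, no RecursiveCharacterTextSplitter pass
)
chunker = MarkdownChunker(config)
for chunk in chunker.chunk("# Title\nBody text under the title."):
    print(chunk)
```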

src/memos/configs/chunker.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -20,6 +20,19 @@ class SentenceChunkerConfig(BaseChunkerConfig):
     """Configuration for sentence-based text chunker."""


+class MarkdownChunkerConfig(BaseChunkerConfig):
+    """Configuration for markdown-based text chunker."""
+
+    headers_to_split_on: list[tuple[str, str]] = Field(
+        default=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")],
+        description="Headers to split on",
+    )
+    strip_headers: bool = Field(default=True, description="Strip headers from the text")
+    recursive: bool = Field(
+        default=False, description="Whether to use recursive character text splitter"
+    )
+
+
 class ChunkerConfigFactory(BaseConfig):
     """Factory class for creating chunker configurations."""

@@ -28,6 +41,7 @@ class ChunkerConfigFactory(BaseConfig):

     backend_to_class: ClassVar[dict[str, Any]] = {
         "sentence": SentenceChunkerConfig,
+        "markdown": MarkdownChunkerConfig,
     }

     @field_validator("backend")
```
src/memos/memories/textual/tree_text_memory/retrieve/searcher.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -507,7 +507,10 @@ def _retrieve_simple(
         user_name: str | None = None,
         **kwargs,
     ):
-        """Retrieve by keywords and embedding"""
+        """
+        Retrieve by keywords and embedding. This function is a hotfix for the
+        sources=plugin mode and will be merged with full-text retrieval in the future.
+        """
         query_words = []
         if self.tokenizer:
             query_words = self.tokenizer.tokenize_mixed(query)
```
