
Commit 69bad38

Feat: add langchain markdown chunker (#574)
* feat: update memos headers
* feat: add headers
* feat: update search agent
* feat: update mem story
* feat: update mem scheduler
* feat: update deepsearch mem code
* feat: update deepsearch agent
* feat: update test code
* fix: remove dup config
* feat: dock search pipeline
* fix: code test
* feat: add test scripts
* feat: add test
* feat: update need_raw process
* fix: add initter
* fix: change agent search func name
* feat: update logs and defined
* feat: update full text mem search
* feat: cp plugin to dev
* feat: add one recall for fulltext retrieval
* fix: set default for fulltext search
* feat: add langchain chunk
1 parent ac9af5f commit 69bad38

File tree

7 files changed: +164 −270 lines changed

Lines changed: 33 additions & 0 deletions

```python
from memos.chunkers import ChunkerFactory
from memos.configs.chunker import ChunkerConfigFactory


config = ChunkerConfigFactory.model_validate(
    {
        "backend": "markdown",
        "config": {
            "chunk_size": 1000,
            "chunk_overlap": 100,
            "recursive": True,
        },
    }
)

chunker = ChunkerFactory.from_config(config)

text = """
# Header 1
This is the first sentence. This is the second sentence.
And here's a third one with some additional context.

# Header 2
This is the fourth sentence. This is the fifth sentence.
And here's a sixth one with some additional context.

# Header 3
This is the seventh sentence. This is the eighth sentence.
And here's a ninth one with some additional context.
"""
chunks = chunker.chunk(text)
for chunk in chunks:
    print("doc:", chunk)
```

poetry.lock

Lines changed: 56 additions & 268 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 2 additions & 1 deletion
```diff
@@ -87,6 +87,7 @@ mem-user = [
 mem-reader = [
     "chonkie (>=1.0.7,<2.0.0)", # Sentence chunking library
     "markitdown[docx,pdf,pptx,xls,xlsx] (>=0.1.1,<0.2.0)", # Markdown parser for various file formats
+    "langchain-text-splitters (>=1.0.0,<2.0.0)", # markdown chunk for langchain
 ]

 # PreferenceTextMemory
@@ -105,6 +106,7 @@ all = [
     "pika (>=1.3.2,<2.0.0)",
     "pymysql (>=1.1.0,<2.0.0)",
     "chonkie (>=1.0.7,<2.0.0)",
+    "langchain-text-splitters (>=1.0.0,<2.0.0)",
     "markitdown[docx,pdf,pptx,xls,xlsx] (>=0.1.1,<0.2.0)",
     "pymilvus (>=2.6.1,<3.0.0)",
     "datasketch (>=1.6.5,<2.0.0)",
@@ -174,7 +176,6 @@ bert-score = "^0.3.13"
 scipy = "^1.10.1"
 python-dotenv = "^1.1.1"
 langgraph = "^0.5.1"
-langmem = "^0.0.27"


 [tool.poetry.group.mem-user.dependencies]
```

src/memos/chunkers/factory.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -3,6 +3,7 @@
 from memos.configs.chunker import ChunkerConfigFactory

 from .base import BaseChunker
+from .markdown_chunker import MarkdownChunker
 from .sentence_chunker import SentenceChunker


@@ -11,6 +12,7 @@ class ChunkerFactory:

     backend_to_class: ClassVar[dict[str, Any]] = {
         "sentence": SentenceChunker,
+        "markdown": MarkdownChunker,
     }

     @classmethod
```
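As a quick illustration (not part of the diff), the new mapping lets the existing factory entry point resolve the "markdown" backend; this sketch reuses the config call from the example script above, and the final assertion assumes `from_config` dispatches through `backend_to_class` as the sentence backend does:

```python
from memos.chunkers import ChunkerFactory
from memos.chunkers.markdown_chunker import MarkdownChunker
from memos.configs.chunker import ChunkerConfigFactory

# "markdown" should resolve to MarkdownChunker via ChunkerFactory.backend_to_class.
config = ChunkerConfigFactory.model_validate(
    {"backend": "markdown", "config": {"chunk_size": 1000, "chunk_overlap": 100}}
)
chunker = ChunkerFactory.from_config(config)
assert isinstance(chunker, MarkdownChunker)
```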
src/memos/chunkers/markdown_chunker.py

Lines changed: 53 additions & 0 deletions
```python
from memos.configs.chunker import MarkdownChunkerConfig
from memos.dependency import require_python_package
from memos.log import get_logger

from .base import BaseChunker, Chunk


logger = get_logger(__name__)


class MarkdownChunker(BaseChunker):
    """Markdown-based text chunker."""

    @require_python_package(
        import_name="langchain_text_splitters",
        install_command="pip install langchain_text_splitters==1.0.0",
        install_link="https://github.com/langchain-ai/langchain-text-splitters",
    )
    def __init__(self, config: MarkdownChunkerConfig):
        from langchain_text_splitters import (
            MarkdownHeaderTextSplitter,
            RecursiveCharacterTextSplitter,
        )

        self.config = config
        self.chunker = MarkdownHeaderTextSplitter(
            headers_to_split_on=config.headers_to_split_on,
            strip_headers=config.strip_headers,
        )
        self.chunker_recursive = None
        logger.info(f"Initialized MarkdownHeaderTextSplitter with config: {config}")
        if config.recursive:
            self.chunker_recursive = RecursiveCharacterTextSplitter(
                chunk_size=config.chunk_size,
                chunk_overlap=config.chunk_overlap,
            )

    def chunk(self, text: str) -> list[str] | list[Chunk]:
        """Chunk the given text into smaller chunks based on markdown headers."""
        md_header_splits = self.chunker.split_text(text)
        chunks = []
        if self.chunker_recursive:
            md_header_splits = self.chunker_recursive.split_documents(md_header_splits)
        for doc in md_header_splits:
            try:
                chunk = " ".join(list(doc.metadata.values())) + "\n" + doc.page_content
                chunks.append(chunk)
            except Exception as e:
                logger.warning(f"Error chunking document: {e}")
                chunks.append(doc.page_content)

        logger.debug(f"Generated {len(chunks)} chunks from input text")
        return chunks
```
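A minimal sketch (not from the commit) of using MarkdownChunker directly, assuming MarkdownChunkerConfig exposes the fields added in src/memos/configs/chunker.py below and accepts the chunk_size/chunk_overlap values seen in the example script:

```python
from memos.chunkers.markdown_chunker import MarkdownChunker
from memos.configs.chunker import MarkdownChunkerConfig

config = MarkdownChunkerConfig(
    chunk_size=1000,
    chunk_overlap=100,
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")],
    strip_headers=True,  # drop "#" lines; header text is re-attached from metadata
    recursive=False,     # header splits only, no RecursiveCharacterTextSplitter pass
)
chunker = MarkdownChunker(config)
for chunk in chunker.chunk("# Title\nBody text under the title."):
    print(chunk)
```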

src/memos/configs/chunker.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -20,6 +20,19 @@ class SentenceChunkerConfig(BaseChunkerConfig):
     """Configuration for sentence-based text chunker."""


+class MarkdownChunkerConfig(BaseChunkerConfig):
+    """Configuration for markdown-based text chunker."""
+
+    headers_to_split_on: list[tuple[str, str]] = Field(
+        default=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")],
+        description="Headers to split on",
+    )
+    strip_headers: bool = Field(default=True, description="Strip headers from the text")
+    recursive: bool = Field(
+        default=False, description="Whether to use recursive character text splitter"
+    )
+
+
 class ChunkerConfigFactory(BaseConfig):
     """Factory class for creating chunker configurations."""

@@ -28,6 +41,7 @@ class ChunkerConfigFactory(BaseConfig):

     backend_to_class: ClassVar[dict[str, Any]] = {
         "sentence": SentenceChunkerConfig,
+        "markdown": MarkdownChunkerConfig,
     }

     @field_validator("backend")
```
src/memos/memories/textual/tree_text_memory/retrieve/searcher.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -507,7 +507,10 @@ def _retrieve_simple(
         user_name: str | None = None,
         **kwargs,
     ):
-        """Retrieve by keywords and embedding"""
+        """
+        Retrieve by keywords and embedding. This function is a hotfix for the
+        sources=plugin mode and will be merged with full-text retrieval in the future.
+        """
         query_words = []
         if self.tokenizer:
             query_words = self.tokenizer.tokenize_mixed(query)
```
