forked from MemTensor/MemOS
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsentence_chunker.py
More file actions
54 lines (42 loc) · 1.95 KB
/
sentence_chunker.py
File metadata and controls
54 lines (42 loc) · 1.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from memos.configs.chunker import SentenceChunkerConfig
from memos.dependency import require_python_package
from memos.log import get_logger
from .base import BaseChunker, Chunk
# Module-level logger, obtained by passing this module's ``__name__`` to the
# project's ``get_logger`` helper; used by SentenceChunker for debug/info output.
logger = get_logger(__name__)
class SentenceChunker(BaseChunker):
    """Sentence-based chunker backed by chonkie's ``SentenceChunker``."""

    @require_python_package(
        import_name="chonkie",
        install_command="pip install chonkie",
        install_link="https://docs.chonkie.ai/python-sdk/getting-started/installation",
    )
    def __init__(self, config: SentenceChunkerConfig):
        """Build the wrapped chonkie chunker from ``config``.

        The tokenizer is first passed under the ``tokenizer`` keyword (the
        chonkie v1.4.0+ spelling). If that construction raises ``TypeError``
        or ``AttributeError``, it is retried with the older
        ``tokenizer_or_token_counter`` keyword.

        Args:
            config: Supplies ``tokenizer_or_token_counter``, ``chunk_size``,
                ``chunk_overlap`` and ``min_sentences_per_chunk``.
        """
        from chonkie import SentenceChunker as ChonkieSentenceChunker

        def build(tokenizer_keyword: str):
            # Config attributes are read here, at call time, so that each
            # construction attempt evaluates them itself (inside the ``try``
            # for the first attempt, inside the handler for the retry).
            return ChonkieSentenceChunker(
                **{
                    tokenizer_keyword: config.tokenizer_or_token_counter,
                    "chunk_size": config.chunk_size,
                    "chunk_overlap": config.chunk_overlap,
                    "min_sentences_per_chunk": config.min_sentences_per_chunk,
                }
            )

        self.config = config

        try:
            # New API first (v1.4.0+).
            self.chunker = build("tokenizer")
        except (TypeError, AttributeError) as err:
            # Old API (<v1.4.0) names the tokenizer argument differently.
            logger.debug(f"Falling back to old chonkie API: {err}")
            self.chunker = build("tokenizer_or_token_counter")

        logger.info(f"Initialized SentenceChunker with config: {config}")

    def chunk(self, text: str) -> list[str] | list[Chunk]:
        """Split ``text`` with the wrapped chonkie chunker.

        Args:
            text: The text to split.

        Returns:
            One ``Chunk`` per chonkie chunk, in the order chonkie produced
            them, each carrying that chunk's ``text``, ``token_count`` and
            ``sentences``.
        """
        chunks = [
            Chunk(
                text=piece.text,
                token_count=piece.token_count,
                sentences=piece.sentences,
            )
            for piece in self.chunker.chunk(text)
        ]
        logger.debug(f"Generated {len(chunks)} chunks from input text")
        return chunks