33"""
44from typing import List
55from .tokenizer import num_tokens_calculus # Import the new tokenizing function
6+ from langchain_core .language_models .chat_models import BaseChatModel
67
def split_text_into_chunks(text: str, chunk_size: int, model: BaseChatModel, use_semchunk=True) -> List[str]:
    """
    Split text into chunks whose token counts do not exceed chunk_size.

    Args:
        text (str): The text to split.
        chunk_size (int): Maximum number of tokens allowed per chunk.
        model (BaseChatModel): Chat model whose tokenizer is used by
            num_tokens_calculus to count tokens.
        use_semchunk (bool): If True, delegate to the semchunk library for
            semantically-aware splitting; otherwise fall back to a simple
            greedy whitespace-based packer.

    Returns:
        List[str]: A list of text chunks.
    """
    if use_semchunk:
        from semchunk import chunk

        def count_tokens(segment: str) -> int:
            # Token-counter callback for semchunk, bound to the given model.
            # (Parameter renamed from `text` to avoid shadowing the outer arg.)
            return num_tokens_calculus(segment, model)

        # Leave headroom below the hard limit so downstream prompt overhead
        # does not push a chunk over the model's context window. Clamp to at
        # least 1: the original expression (min(chunk_size - 500,
        # chunk_size * 0.9)) goes zero/negative for chunk_size <= 500,
        # which is an invalid target size for semchunk.
        effective_size = max(1, min(chunk_size - 500, int(chunk_size * 0.9)))

        return chunk(
            text=text,
            chunk_size=effective_size,
            token_counter=count_tokens,
            memoize=False,
        )

    # Fallback: greedy packing of whitespace-separated words by token count.
    if num_tokens_calculus(text, model) <= chunk_size:
        return [text]

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for word in text.split():
        word_tokens = num_tokens_calculus(word, model)
        if current_length + word_tokens > chunk_size:
            # Flush only when non-empty: the original unconditionally
            # appended here, emitting a spurious empty-string chunk when
            # the very first word already exceeded chunk_size.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_tokens
        else:
            current_chunk.append(word)
            current_length += word_tokens

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
0 commit comments