77
88
99class Chunker :
10+ ESTIMATE_CHARS_PER_TOKEN = 4
11+
def __init__(self, conn: sqlite3.Connection, settings: Settings):
    """Keep references to the objects the chunking methods rely on.

    Args:
        conn: SQLite connection; the token-count query is executed on it.
        settings: Configuration object; ``chunk_size`` is read from it.
    """
    self._conn, self._settings = conn, settings
1315
1416 def chunk (self , text : str , metadata : dict = {}) -> list [Chunk ]:
1517 """Chunk text using Recursive Character Text Splitter."""
1618 chunks = []
19+
1720 if self ._get_token_count (text ) <= self ._settings .chunk_size :
1821 chunks = [Chunk (content = text )]
1922 else :
@@ -25,13 +28,19 @@ def _get_token_count(self, text: str) -> int:
2528 """Get token count using SQLite AI extension."""
2629 if text == "" :
2730 return 0
31+
32+ # Fallback to estimated token count for very large texts
33+ # to avoid performance issues
34+ if len (text ) > self ._settings .chunk_size * self .ESTIMATE_CHARS_PER_TOKEN * 2 :
35+ return self ._estimate_tokens_count (text )
36+
2837 cursor = self ._conn .execute ("SELECT llm_token_count(?) AS count" , (text ,))
2938 return cursor .fetchone ()["count" ]
3039
3140 def _estimate_tokens_count (self , text : str ) -> int :
3241 """Estimate token count more conservatively."""
3342 # This is a simple heuristic; adjust as needed
34- return (len (text ) + 3 ) // 4
43+ return (len (text ) + 3 ) // self . ESTIMATE_CHARS_PER_TOKEN
3544
3645 def _recursive_split (self , text : str ) -> List [Chunk ]:
3746 """Recursively split text into chunks with overlap."""
@@ -119,7 +128,7 @@ def _split_by_characters(self, text: str) -> List[Chunk]:
119128 chars_per_token = (
120129 math .ceil (len (text ) / total_tokens )
121130 if total_tokens > 0
122- else 4 # Assume 4 chars per token if no tokens found
131+ else self . ESTIMATE_CHARS_PER_TOKEN # Assume chars per token if no tokens found
123132 )
124133
125134 # Estimate characters that fit the chunk size
0 commit comments