11#!/usr/bin/env python3
22"""
3- Semantic chunking utilities using Chonkie and semchunk.
4- Implements two-level chunking strategy for optimal RAG performance.
3+ Semantic chunking utilities using semchunk.
4+ Simplified single-pass chunking for optimal RAG performance.
55"""
66
77import logging
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# Chunking configuration from environment variables
# CHUNK_SIZE: Target chunk size in tokens (default: 512)
# CHUNK_MAX_TOKENS: Maximum chunk size before re-chunking (default: 1500, safe for nomic-embed-text 2048 limit)
CHUNK_SIZE = int(os.environ.get('CHUNK_SIZE', '512'))
CHUNK_MAX_TOKENS = int(os.environ.get('CHUNK_MAX_TOKENS', '1500'))
1919
2020# Custom exception for chunking failures
@@ -23,22 +23,34 @@ class ChunkingError(RuntimeError):
2323 pass
2424
2525
# Cache of tiktoken encodings keyed by encoding name, so repeated token
# counts don't re-load the (expensive) encoding tables. Keyed rather than
# single-slot: a single cached object would silently return the wrong
# encoding if callers ever pass a different encoding_name.
_TIKTOKEN_ENCODINGS: dict = {}


def _get_tiktoken_encoding(encoding_name: str = "cl100k_base"):
    """Return a cached tiktoken encoding, loading it on first use.

    Args:
        encoding_name: Tiktoken encoding name (e.g. "cl100k_base").

    Returns:
        The tiktoken Encoding object for ``encoding_name``.
    """
    enc = _TIKTOKEN_ENCODINGS.get(encoding_name)
    if enc is None:
        enc = tiktoken.get_encoding(encoding_name)
        _TIKTOKEN_ENCODINGS[encoding_name] = enc
    return enc
36+
37+
def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    """
    Count tokens in text using tiktoken (cached).

    Args:
        text: Text to count tokens for
        encoding_name: Tiktoken encoding name

    Returns:
        Number of tokens
    """
    try:
        encoded = _get_tiktoken_encoding(encoding_name).encode(text)
    except Exception as e:
        logger.warning(f"Token counting failed: {e}, using word-based fallback")
        # Fallback: approximate 1 token ≈ 1 word
        return len(text.split())
    return len(encoded)
4456
@@ -179,6 +191,63 @@ def fine_chunk_text(
179191 raise ChunkingError (str (e ))
180192
181193
def semchunk_text(
    text: str,
    target_tokens: int = 512,
    overlap_tokens: int = 50
) -> list[dict]:
    """
    Direct semantic chunking using semchunk.

    Single-pass chunking that respects semantic boundaries.
    Simpler and faster than two-level chunking.

    Args:
        text: Text to chunk
        target_tokens: Target size for chunks (tokens)
        overlap_tokens: Overlap between chunks (tokens)

    Returns:
        List of chunk dictionaries with keys: 'text',
        'semantic_block_index', 'chunk_index', 'token_count',
        'chunking_method'.

    Raises:
        ChunkingError: If semchunk is not installed or chunking fails.
    """
    # Nothing to do for empty or whitespace-only input.
    if not text or len(text.strip()) == 0:
        return []

    try:
        from semchunk import chunkerify

        # Build a chunker around the cl100k_base tokenizer; semchunk
        # resolves the encoding name to a tokenizer itself.
        chunker = chunkerify("cl100k_base", chunk_size=target_tokens)

        # Chunk the text directly in a single pass.
        chunk_texts = chunker(text, overlap=overlap_tokens)

        # Build chunk dictionaries with metadata, skipping empty chunks.
        chunks = []
        for idx, chunk_text in enumerate(chunk_texts):
            if not chunk_text or len(chunk_text.strip()) == 0:
                continue

            token_count = count_tokens(chunk_text)
            chunks.append({
                'text': chunk_text,
                'semantic_block_index': 0,  # Single block for direct chunking
                'chunk_index': idx,
                'token_count': token_count,
                'chunking_method': 'semchunk'
            })

        logger.info(f"Semchunk created {len(chunks)} chunks from text")
        return chunks

    except ImportError as e:
        logger.error("Semchunk not installed, cannot perform chunking")
        # Chain the cause so the underlying import failure stays visible.
        raise ChunkingError("Semchunk not installed") from e
    except Exception as e:
        logger.warning(f"Semchunk failed: {e}")
        raise ChunkingError(str(e)) from e
250+
182251def _fallback_chunk (
183252 blocks : list [str ],
184253 target_tokens : int ,
@@ -244,16 +313,15 @@ def create_chunks(
244313 max_tokens : int = None
245314) -> list [dict ]:
246315 """
247- Create chunks from text using two-level semantic chunking (chonkie + semchunk).
316+ Create chunks from text using semantic chunking (semchunk only ).
248317
249- Pipeline:
250- 1. Chonkie TokenChunker: creates macro-semantic blocks (2x target size)
251- 2. Semchunk: refines into fine-grained embedding-ready chunks
252- 3. Filter: removes too short/long chunks
318+ Simplified pipeline:
319+ 1. Semchunk: creates embedding-ready chunks respecting semantic boundaries
320+ 2. Filter: removes too short/long chunks
253321
254322 Args:
255323 text: Text to chunk
256- chunk_size: Target chunk size in tokens (default: CHUNK_SIZE env var or 400 )
324+ chunk_size: Target chunk size in tokens (default: CHUNK_SIZE env var or 512 )
257325 chunk_overlap: Overlap between chunks in tokens (default: 50)
258326 min_tokens: Minimum chunk size to keep (default: 0)
259327 max_tokens: Maximum chunk size before re-chunking (default: CHUNK_MAX_TOKENS env var or 1500)
@@ -271,32 +339,26 @@ def create_chunks(
271339 return []
272340
273341 try :
274- # Level 1: Chonkie semantic chunking (macro blocks)
275- macro_chunks = semantic_chunk_text (
342+ # Direct semchunk - no need for two-level chunking
343+ # Semchunk already handles semantic boundaries well
344+ chunks = semchunk_text (
276345 text ,
277- chunk_size = chunk_size * 2 , # Larger blocks first
278- chunk_overlap = chunk_overlap
279- )
280-
281- if not macro_chunks :
282- logger .warning ("No macro chunks created, using fallback" )
283- return _fallback_chunk ([text ], chunk_size , chunk_overlap )
284-
285- # Level 2: Semchunk fine-grained chunking
286- fine_chunks = fine_chunk_text (
287- macro_chunks ,
288346 target_tokens = chunk_size ,
289347 overlap_tokens = chunk_overlap
290348 )
291349
292- # Level 3: Filter and validate
350+ if not chunks :
351+ logger .warning ("No chunks created, using fallback" )
352+ return _fallback_chunk ([text ], chunk_size , chunk_overlap )
353+
354+ # Filter and validate
293355 valid_chunks = filter_chunks (
294- fine_chunks ,
356+ chunks ,
295357 min_tokens = min_tokens ,
296358 max_tokens = max_tokens
297359 )
298360
299- logger .info (f"Created { len (valid_chunks )} chunks (chonkie+ semchunk pipeline)" )
361+ logger .info (f"Created { len (valid_chunks )} chunks (semchunk pipeline)" )
300362 return valid_chunks
301363
302364 except ChunkingError as e :
@@ -319,7 +381,7 @@ def filter_chunks(
319381 chunks: List of chunk dictionaries
320382 min_tokens: Minimum token count - 0 = keep all (default)
321383 max_tokens: Maximum token count (default: CHUNK_MAX_TOKENS env var or 1500)
322-
384+
323385 Returns:
324386 Filtered list of valid chunks
325387 """
@@ -329,7 +391,11 @@ def filter_chunks(
329391 valid_chunks = []
330392
331393 for chunk in chunks :
332- token_count = chunk .get ('token_count' , count_tokens (chunk ['text' ]))
394+ # Use cached token_count if available, only count if missing
395+ token_count = chunk .get ('token_count' )
396+ if token_count is None :
397+ token_count = count_tokens (chunk ['text' ])
398+ chunk ['token_count' ] = token_count # Cache for later use
333399
334400 if token_count < min_tokens :
335401 logger .debug (f"Discarding too short chunk: { token_count } tokens" )
0 commit comments