55"""
66
77import logging
8+ import os
89from typing import Optional , TypedDict , List
910import tiktoken
1011
1112logger = logging .getLogger (__name__ )
1213
14+ # Chunking configuration from environment variables
15+ # CHUNK_SIZE: Target chunk size in tokens (default: 400)
16+ # CHUNK_MAX_TOKENS: Maximum chunk size before re-chunking (default: 1500, safe for nomic-embed-text 2048 limit)
17+ CHUNK_SIZE = int (os .getenv ('CHUNK_SIZE' , '400' ))
18+ CHUNK_MAX_TOKENS = int (os .getenv ('CHUNK_MAX_TOKENS' , '1500' ))
19+
1320# Custom exception for chunking failures
1421class ChunkingError (RuntimeError ):
1522 """Raised when semantic chunking cannot be performed."""
@@ -212,27 +219,29 @@ def _fallback_chunk(
     return final_chunks
 
 
-def validate_chunk_size(chunk_text: str, max_tokens: int = 2048) -> bool:
+def validate_chunk_size(chunk_text: str, max_tokens: Optional[int] = None) -> bool:
     """
     Validate that a chunk doesn't exceed the embedding model's token limit.
 
     Args:
         chunk_text: Chunk text to validate
-        max_tokens: Maximum allowed tokens (nomic-embed-text: 2048)
+        max_tokens: Maximum allowed tokens (default: CHUNK_MAX_TOKENS env var or 1500)
 
     Returns:
         True if chunk is within limits
     """
+    if max_tokens is None:
+        max_tokens = CHUNK_MAX_TOKENS
     token_count = count_tokens(chunk_text)
     return token_count <= max_tokens
 
 
 def create_chunks(
     text: str,
-    chunk_size: int = 500,
+    chunk_size: Optional[int] = None,
     chunk_overlap: int = 50,
     min_tokens: int = 0,
-    max_tokens: int = 2048
+    max_tokens: Optional[int] = None
 ) -> list[dict]:
     """
     Create chunks from text using two-level semantic chunking (chonkie + semchunk).
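
A short usage sketch of the new default resolution in `validate_chunk_size`, again assuming the file is importable as `chunking`:

```python
from chunking import validate_chunk_size  # module name assumed

chunk = "Some chunk text to check before embedding."

# With max_tokens omitted, the limit comes from CHUNK_MAX_TOKENS
# (default 1500) rather than the previous hard-coded 2048.
print(validate_chunk_size(chunk))
# An explicit value still overrides the environment-driven default.
print(validate_chunk_size(chunk, max_tokens=2048))
```
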
@@ -244,14 +253,20 @@ def create_chunks(
 
     Args:
         text: Text to chunk
-        chunk_size: Target chunk size in tokens (default: 500)
+        chunk_size: Target chunk size in tokens (default: CHUNK_SIZE env var or 400)
         chunk_overlap: Overlap between chunks in tokens (default: 50)
-        min_tokens: Minimum chunk size to keep (default: 50)
-        max_tokens: Maximum chunk size before re-chunking (default: 2048)
+        min_tokens: Minimum chunk size to keep (default: 0)
+        max_tokens: Maximum chunk size before re-chunking (default: CHUNK_MAX_TOKENS env var or 1500)
 
     Returns:
         List of chunk dictionaries with text and metadata
     """
+    # Apply defaults from environment variables
+    if chunk_size is None:
+        chunk_size = CHUNK_SIZE
+    if max_tokens is None:
+        max_tokens = CHUNK_MAX_TOKENS
+
     if not text or len(text.strip()) == 0:
         return []
 
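
With the lazy defaults above, a caller that passes nothing gets the environment-driven sizes; a sketch under the same assumed `chunking` module name (the `token_count` key is read defensively, since the docstring only promises "text and metadata"):

```python
from chunking import create_chunks  # module name assumed

text = "Paragraph one about embeddings. " * 100

# chunk_size falls back to CHUNK_SIZE (400) and max_tokens to
# CHUNK_MAX_TOKENS (1500) because neither argument is supplied.
chunks = create_chunks(text)
for c in chunks:
    print(c.get('token_count'), repr(c['text'][:60]))
```
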
@@ -295,21 +310,24 @@ def create_chunks(
 def filter_chunks(
     chunks: list[dict],
     min_tokens: int = 0,
-    max_tokens: int = 8192
+    max_tokens: Optional[int] = None
 ) -> list[dict]:
     """
     Filter chunks by token count.
 
     Args:
         chunks: List of chunk dictionaries
         min_tokens: Minimum token count - 0 = keep all (default)
-        max_tokens: Maximum token count (needs re-chunking)
+        max_tokens: Maximum token count (default: CHUNK_MAX_TOKENS env var or 1500)
 
     Returns:
         Filtered list of valid chunks
     """
+    if max_tokens is None:
+        max_tokens = CHUNK_MAX_TOKENS
+
     valid_chunks = []
-
+
     for chunk in chunks:
         token_count = chunk.get('token_count', count_tokens(chunk['text']))