
Commit 60ab049

perf(pipeline): optimize embedding pipeline with simplified chunking, larger batch sizes, and an LRU cache for queries

Removed Chonkie in favor of single-pass chunking with Semchunk, added a global tiktoken encoding cache, eliminated redundant token counting, raised the embedding batch size to 32 and the Qdrant batch size to 100, made the embedding model configurable via the EMBEDDING_MODEL env var, and added an LRU cache for query embeddings.
1 parent 6cdd79e commit 60ab049

7 files changed, +149 −110 lines

api/routes/search.py

Lines changed: 18 additions & 4 deletions
@@ -2,6 +2,7 @@
 
 import os
 import warnings
+from functools import lru_cache
 from typing import Optional
 
 from fastapi import APIRouter, HTTPException
@@ -17,6 +18,7 @@
 QDRANT_URL = os.getenv('QDRANT_URL', 'http://localhost:6333')
 QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
 OLLAMA_URL = os.getenv('OLLAMA_URL', 'http://localhost:11434')
+EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'nomic-embed-text')
 
 
 def get_qdrant_client() -> QdrantClient:
@@ -26,22 +28,34 @@ def get_qdrant_client() -> QdrantClient:
     return QdrantClient(url=QDRANT_URL)
 
 
-def get_embedding(text: str, model: str = "nomic-embed-text", timeout: int = 30) -> list[float] | None:
-    """Generate embedding using Ollama."""
+@lru_cache(maxsize=1000)
+def _cached_embedding(text: str, model: str) -> tuple[float, ...] | None:
+    """Generate embedding with LRU cache (returns tuple for hashability)."""
     import requests
 
     try:
         response = requests.post(
             f"{OLLAMA_URL}/api/embeddings",
             json={"model": model, "prompt": text},
-            timeout=timeout
+            timeout=30
         )
         response.raise_for_status()
-        return response.json().get("embedding")
+        embedding = response.json().get("embedding")
+        return tuple(embedding) if embedding else None
     except Exception:
         return None
 
 
+def get_embedding(text: str, model: str = None, timeout: int = 30) -> list[float] | None:
+    """Generate embedding using Ollama with caching."""
+    if model is None:
+        model = EMBEDDING_MODEL
+
+    # Use cached version (returns tuple, convert back to list)
+    result = _cached_embedding(text, model)
+    return list(result) if result else None
+
+
 class SearchRequest(BaseModel):
     """Search request body."""
     query: str
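For reference, a minimal sketch of the query-side caching behaviour (the import path and a reachable Ollama instance are assumptions, not part of this change):

```python
# Sketch only: repeated identical queries reuse the cached embedding.
# Assumes api.routes.search is importable and Ollama is running.
from api.routes.search import get_embedding, _cached_embedding

query = "how do I tune the qdrant batch size?"

first = get_embedding(query)   # miss: calls Ollama, caches the tuple
second = get_embedding(query)  # hit: served from the LRU cache

assert first == second
print(_cached_embedding.cache_info())  # CacheInfo(hits=1, misses=1, ...)
```

Returning a tuple from `_cached_embedding` keeps the cached value immutable, so a caller mutating the list returned by `get_embedding` cannot corrupt later cache hits.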

config.yaml

Lines changed: 5 additions & 9 deletions
@@ -9,24 +9,20 @@ extraction:
   max_file_size: 104857600  # 100MB
 
 chunking:
-  strategies:
-    markdown: semantic
-    code: syntax_aware
-    pdf: page_aware
-    default: semantic
+  # Single-pass semantic chunking with semchunk
   chunk_size: 512
   overlap: 50
   max_tokens: 2048
 
 embedding:
   provider: ollama
-  model: nomic-embed-text
-  batch_size: 10
+  model: nomic-embed-text  # Configurable via EMBEDDING_MODEL env
+  batch_size: 32  # Increased from 10 for better throughput
   # url: http://localhost:11434  # Defaults from env OLLAMA_URL
 
 qdrant:
-  collection: documentation  # Changed to match existing data
-  batch_size: 10
+  collection: documentation
+  batch_size: 100  # Increased from 10 for better throughput
   # url: http://localhost:6333  # Defaults from env QDRANT_URL
   # api_key: null  # Defaults from env QDRANT_API_KEY
3228

lib/chunking.py

Lines changed: 99 additions & 33 deletions
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
-Semantic chunking utilities using Chonkie and semchunk.
-Implements two-level chunking strategy for optimal RAG performance.
+Semantic chunking utilities using semchunk.
+Simplified single-pass chunking for optimal RAG performance.
 """
 
 import logging
@@ -12,9 +12,9 @@
 logger = logging.getLogger(__name__)
 
 # Chunking configuration from environment variables
-# CHUNK_SIZE: Target chunk size in tokens (default: 400)
+# CHUNK_SIZE: Target chunk size in tokens (default: 512)
 # CHUNK_MAX_TOKENS: Maximum chunk size before re-chunking (default: 1500, safe for nomic-embed-text 2048 limit)
-CHUNK_SIZE = int(os.getenv('CHUNK_SIZE', '400'))
+CHUNK_SIZE = int(os.getenv('CHUNK_SIZE', '512'))
 CHUNK_MAX_TOKENS = int(os.getenv('CHUNK_MAX_TOKENS', '1500'))
 
 # Custom exception for chunking failures
@@ -23,22 +23,34 @@ class ChunkingError(RuntimeError):
     pass
 
 
+# Global tiktoken encoding cache for performance
+_TIKTOKEN_ENC = None
+
+
+def _get_tiktoken_encoding(encoding_name: str = "cl100k_base"):
+    """Get cached tiktoken encoding for performance."""
+    global _TIKTOKEN_ENC
+    if _TIKTOKEN_ENC is None:
+        _TIKTOKEN_ENC = tiktoken.get_encoding(encoding_name)
+    return _TIKTOKEN_ENC
+
+
 def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
     """
-    Count tokens in text using tiktoken.
-
+    Count tokens in text using tiktoken (cached).
+
     Args:
         text: Text to count tokens for
         encoding_name: Tiktoken encoding name
-
+
     Returns:
         Number of tokens
     """
     try:
-        enc = tiktoken.get_encoding(encoding_name)
+        enc = _get_tiktoken_encoding(encoding_name)
         return len(enc.encode(text))
     except Exception as e:
-        logger.warning(f"Token counting failed: {e}, using wordbased fallback")
+        logger.warning(f"Token counting failed: {e}, using word-based fallback")
         # Fallback: approximate 1 token ≈ 1 word
         return len(text.split())
 
@@ -179,6 +191,63 @@ def fine_chunk_text(
         raise ChunkingError(str(e))
 
 
+def semchunk_text(
+    text: str,
+    target_tokens: int = 512,
+    overlap_tokens: int = 50
+) -> list[dict]:
+    """
+    Direct semantic chunking using semchunk.
+
+    Single-pass chunking that respects semantic boundaries.
+    Simpler and faster than two-level chunking.
+
+    Args:
+        text: Text to chunk
+        target_tokens: Target size for chunks (tokens)
+        overlap_tokens: Overlap between chunks (tokens)
+
+    Returns:
+        List of chunk dictionaries with metadata
+    """
+    if not text or len(text.strip()) == 0:
+        return []
+
+    try:
+        from semchunk import chunkerify
+
+        # Create chunker with cached tiktoken encoding
+        chunker = chunkerify("cl100k_base", chunk_size=target_tokens)
+
+        # Chunk the text directly
+        chunk_texts = chunker(text, overlap=overlap_tokens)
+
+        # Build chunk dictionaries with metadata
+        chunks = []
+        for idx, chunk_text in enumerate(chunk_texts):
+            if not chunk_text or len(chunk_text.strip()) == 0:
+                continue
+
+            token_count = count_tokens(chunk_text)
+            chunks.append({
+                'text': chunk_text,
+                'semantic_block_index': 0,  # Single block for direct chunking
+                'chunk_index': idx,
+                'token_count': token_count,
+                'chunking_method': 'semchunk'
+            })
+
+        logger.info(f"Semchunk created {len(chunks)} chunks from text")
+        return chunks
+
+    except ImportError as e:
+        logger.error("Semchunk not installed, cannot perform chunking")
+        raise ChunkingError("Semchunk not installed")
+    except Exception as e:
+        logger.warning(f"Semchunk failed: {e}")
+        raise ChunkingError(str(e))
+
+
 def _fallback_chunk(
     blocks: list[str],
     target_tokens: int,
@@ -244,16 +313,15 @@ def create_chunks(
     max_tokens: int = None
 ) -> list[dict]:
     """
-    Create chunks from text using two-level semantic chunking (chonkie + semchunk).
+    Create chunks from text using semantic chunking (semchunk only).
 
-    Pipeline:
-    1. Chonkie TokenChunker: creates macro-semantic blocks (2x target size)
-    2. Semchunk: refines into fine-grained embedding-ready chunks
-    3. Filter: removes too short/long chunks
+    Simplified pipeline:
+    1. Semchunk: creates embedding-ready chunks respecting semantic boundaries
+    2. Filter: removes too short/long chunks
 
     Args:
         text: Text to chunk
-        chunk_size: Target chunk size in tokens (default: CHUNK_SIZE env var or 400)
+        chunk_size: Target chunk size in tokens (default: CHUNK_SIZE env var or 512)
         chunk_overlap: Overlap between chunks in tokens (default: 50)
         min_tokens: Minimum chunk size to keep (default: 0)
         max_tokens: Maximum chunk size before re-chunking (default: CHUNK_MAX_TOKENS env var or 1500)
@@ -271,32 +339,26 @@ def create_chunks(
         return []
 
     try:
-        # Level 1: Chonkie semantic chunking (macro blocks)
-        macro_chunks = semantic_chunk_text(
+        # Direct semchunk - no need for two-level chunking
+        # Semchunk already handles semantic boundaries well
+        chunks = semchunk_text(
             text,
-            chunk_size=chunk_size * 2,  # Larger blocks first
-            chunk_overlap=chunk_overlap
-        )
-
-        if not macro_chunks:
-            logger.warning("No macro chunks created, using fallback")
-            return _fallback_chunk([text], chunk_size, chunk_overlap)
-
-        # Level 2: Semchunk fine-grained chunking
-        fine_chunks = fine_chunk_text(
-            macro_chunks,
             target_tokens=chunk_size,
             overlap_tokens=chunk_overlap
         )
 
-        # Level 3: Filter and validate
+        if not chunks:
+            logger.warning("No chunks created, using fallback")
+            return _fallback_chunk([text], chunk_size, chunk_overlap)
+
+        # Filter and validate
         valid_chunks = filter_chunks(
-            fine_chunks,
+            chunks,
             min_tokens=min_tokens,
             max_tokens=max_tokens
        )
 
-        logger.info(f"Created {len(valid_chunks)} chunks (chonkie+semchunk pipeline)")
+        logger.info(f"Created {len(valid_chunks)} chunks (semchunk pipeline)")
         return valid_chunks
 
     except ChunkingError as e:
@@ -319,7 +381,7 @@ def filter_chunks(
         chunks: List of chunk dictionaries
         min_tokens: Minimum token count - 0 = keep all (default)
         max_tokens: Maximum token count (default: CHUNK_MAX_TOKENS env var or 1500)
-
+
     Returns:
         Filtered list of valid chunks
     """
@@ -329,7 +391,11 @@ def filter_chunks(
     valid_chunks = []
 
     for chunk in chunks:
-        token_count = chunk.get('token_count', count_tokens(chunk['text']))
+        # Use cached token_count if available, only count if missing
+        token_count = chunk.get('token_count')
+        if token_count is None:
+            token_count = count_tokens(chunk['text'])
+            chunk['token_count'] = token_count  # Cache for later use
 
         if token_count < min_tokens:
             logger.debug(f"Discarding too short chunk: {token_count} tokens")

lib/config.py

Lines changed: 7 additions & 21 deletions
@@ -24,19 +24,9 @@ class ExtractionConfig(BaseModel):
     max_file_size: int = Field(default=100 * 1024 * 1024, description="Maximum file size in bytes")
 
 
-class ChunkingStrategyConfig(BaseModel):
-    """Chunking strategies per file type."""
-
-    markdown: str = Field(default="semantic", description="Strategy for markdown files")
-    code: str = Field(default="syntax_aware", description="Strategy for code files")
-    pdf: str = Field(default="page_aware", description="Strategy for PDF files")
-    default: str = Field(default="semantic", description="Default strategy")
-
-
 class ChunkingConfig(BaseModel):
-    """Configuration for text chunking."""
+    """Configuration for text chunking (single-pass semchunk)."""
 
-    strategies: ChunkingStrategyConfig = Field(default_factory=ChunkingStrategyConfig)
     chunk_size: int = Field(default=512, description="Target chunk size in tokens")
     overlap: int = Field(default=50, description="Overlap between chunks in tokens")
     max_tokens: int = Field(default=2048, description="Maximum tokens per chunk (nomic-embed-text limit)")
@@ -47,7 +37,7 @@ class EmbeddingConfig(BaseModel):
 
     provider: str = Field(default="ollama", description="Embedding provider")
     model: str = Field(default="nomic-embed-text", description="Embedding model name")
-    batch_size: int = Field(default=10, description="Batch size for embedding")
+    batch_size: int = Field(default=32, description="Batch size for embedding (optimized)")
     url: Optional[str] = Field(default=None, description="Ollama/API URL")
 
     @field_validator('url')
@@ -68,7 +58,7 @@ class QdrantConfig(BaseModel):
     """Configuration for Qdrant vector database."""
 
     collection: str = Field(default="documentation", description="Collection name")
-    batch_size: int = Field(default=10, description="Batch upload size")
+    batch_size: int = Field(default=100, description="Batch upload size (optimized)")
     url: str = Field(default_factory=_get_qdrant_url, description="Qdrant URL")
     api_key: Optional[str] = Field(default_factory=_get_qdrant_api_key, description="API key if required")
 
@@ -259,24 +249,20 @@ def merge_cli_args(config: RagifyConfig, args: dict) -> RagifyConfig:
   max_file_size: 104857600  # 100MB
 
 chunking:
-  strategies:
-    markdown: semantic
-    code: syntax_aware
-    pdf: page_aware
-    default: semantic
+  # Single-pass semantic chunking with semchunk
   chunk_size: 512
   overlap: 50
   max_tokens: 2048
 
 embedding:
   provider: ollama
-  model: nomic-embed-text
-  batch_size: 10
+  model: nomic-embed-text  # Configurable via EMBEDDING_MODEL env
+  batch_size: 32  # Optimized for throughput
   # url: http://localhost:11434  # Defaults from env OLLAMA_URL
 
 qdrant:
   collection: documentation
-  batch_size: 10
+  batch_size: 100  # Optimized for throughput
   # url: http://localhost:6333  # Defaults from env QDRANT_URL
   # api_key: null  # Defaults from env QDRANT_API_KEY
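The larger defaults (32 texts per embedding batch, 100 points per Qdrant upsert) matter mostly on the indexing side. A minimal sketch of how they might be applied; `index_chunks` and `embed_fn` are hypothetical helpers, not part of this commit:

```python
# Sketch only: batch embeddings 32 at a time and upsert 100 points per call,
# matching the new EmbeddingConfig / QdrantConfig defaults.
from typing import Callable

from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct

EMBED_BATCH = 32    # embedding.batch_size
UPSERT_BATCH = 100  # qdrant.batch_size


def index_chunks(
    chunks: list[dict],
    embed_fn: Callable[[str], list[float] | None],
    client: QdrantClient,
    collection: str = "documentation",
) -> None:
    """Hypothetical indexing loop: embed in batches of 32, upsert in batches of 100."""
    points = []
    for start in range(0, len(chunks), EMBED_BATCH):
        batch = chunks[start:start + EMBED_BATCH]
        vectors = [embed_fn(c["text"]) for c in batch]
        points.extend(
            PointStruct(id=start + i, vector=vec, payload=chunk)
            for i, (chunk, vec) in enumerate(zip(batch, vectors))
            if vec is not None  # skip chunks whose embedding failed
        )
    for start in range(0, len(points), UPSERT_BATCH):
        client.upsert(
            collection_name=collection,
            points=points[start:start + UPSERT_BATCH],
        )
```

Fewer, larger upserts reduce per-request overhead against Qdrant, which is the rationale behind raising the default from 10 to 100.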
