Commit 5d9b4bc

chore(release): release version 1.2.2
1 parent 4fd2a2f commit 5d9b4bc

File tree

3 files changed: +38 −10 lines

CHANGELOG.md (8 additions, 0 deletions)

@@ -7,6 +7,14 @@ and the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
 
+## [1.2.2] - 2025-12-03
+
+### Added
+- Chunking parameters now configurable via environment variables: `CHUNK_SIZE` (default 400) and `CHUNK_MAX_TOKENS` (default 1500)
+
+### Fixed
+- Reduced default chunk size to prevent Ollama panic with nomic-embed-text model (2048 token context limit)
+
 ## [1.2.1] - 2025-12-02
 
 ### Fixed

README.md (2 additions, 0 deletions)

@@ -92,6 +92,8 @@ docker compose up -d
 | `BASE_URL` | `http://localhost:8080` | Public URL for OAuth callbacks |
 | `API_PORT` | `8080` | API and Web UI port |
 | `OLLAMA_MODEL` | `nomic-embed-text` | Embedding model |
+| `CHUNK_SIZE` | `400` | Target chunk size in tokens |
+| `CHUNK_MAX_TOKENS` | `1500` | Maximum chunk size (safe margin for nomic-embed-text 2048 limit) |
 
 ## Features
 

lib/chunking.py (28 additions, 10 deletions)

@@ -5,11 +5,18 @@
 """
 
 import logging
+import os
 from typing import Optional, TypedDict, List
 import tiktoken
 
 logger = logging.getLogger(__name__)
 
+# Chunking configuration from environment variables
+# CHUNK_SIZE: Target chunk size in tokens (default: 400)
+# CHUNK_MAX_TOKENS: Maximum chunk size before re-chunking (default: 1500, safe for nomic-embed-text 2048 limit)
+CHUNK_SIZE = int(os.getenv('CHUNK_SIZE', '400'))
+CHUNK_MAX_TOKENS = int(os.getenv('CHUNK_MAX_TOKENS', '1500'))
+
 # Custom exception for chunking failures
 class ChunkingError(RuntimeError):
     """Raised when semantic chunking cannot be performed."""
@@ -212,27 +219,29 @@ def _fallback_chunk(
     return final_chunks
 
 
-def validate_chunk_size(chunk_text: str, max_tokens: int = 2048) -> bool:
+def validate_chunk_size(chunk_text: str, max_tokens: int = None) -> bool:
     """
     Validate that chunk doesn't exceed embedding model's token limit.
 
     Args:
         chunk_text: Chunk text to validate
-        max_tokens: Maximum allowed tokens (nomic-embed-text: 2048)
+        max_tokens: Maximum allowed tokens (default: CHUNK_MAX_TOKENS env var or 1500)
 
     Returns:
         True if chunk is within limits
     """
+    if max_tokens is None:
+        max_tokens = CHUNK_MAX_TOKENS
     token_count = count_tokens(chunk_text)
     return token_count <= max_tokens
 
 
 def create_chunks(
     text: str,
-    chunk_size: int = 500,
+    chunk_size: int = None,
     chunk_overlap: int = 50,
     min_tokens: int = 0,
-    max_tokens: int = 2048
+    max_tokens: int = None
 ) -> list[dict]:
     """
     Create chunks from text using two-level semantic chunking (chonkie + semchunk).
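
The `None` sentinel keeps explicit call sites unchanged while deferring the real default to the env-derived constant (strictly, the annotations would now be `Optional[int]`). A hypothetical call sketch:

    from lib.chunking import validate_chunk_size  # import path assumed

    validate_chunk_size("some chunk text")                   # capped at CHUNK_MAX_TOKENS (1500 by default)
    validate_chunk_size("some chunk text", max_tokens=2048)  # an explicit cap still takes precedence
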
@@ -244,14 +253,20 @@
 
     Args:
         text: Text to chunk
-        chunk_size: Target chunk size in tokens (default: 500)
+        chunk_size: Target chunk size in tokens (default: CHUNK_SIZE env var or 400)
         chunk_overlap: Overlap between chunks in tokens (default: 50)
-        min_tokens: Minimum chunk size to keep (default: 50)
-        max_tokens: Maximum chunk size before re-chunking (default: 2048)
+        min_tokens: Minimum chunk size to keep (default: 0)
+        max_tokens: Maximum chunk size before re-chunking (default: CHUNK_MAX_TOKENS env var or 1500)
 
     Returns:
         List of chunk dictionaries with text and metadata
     """
+    # Apply defaults from environment variables
+    if chunk_size is None:
+        chunk_size = CHUNK_SIZE
+    if max_tokens is None:
+        max_tokens = CHUNK_MAX_TOKENS
+
     if not text or len(text.strip()) == 0:
         return []
 
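With these defaults resolved inside the function body, a bare `create_chunks(text)` call now follows the environment configuration. A hypothetical usage sketch (import path and input file assumed):

    from lib.chunking import create_chunks  # import path assumed

    with open('document.txt', encoding='utf-8') as fh:  # hypothetical input
        text = fh.read()

    chunks = create_chunks(text)                 # chunk_size=CHUNK_SIZE, max_tokens=CHUNK_MAX_TOKENS
    small = create_chunks(text, chunk_size=256)  # explicit arguments still override the env defaults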

@@ -295,21 +310,24 @@
 def filter_chunks(
     chunks: list[dict],
     min_tokens: int = 0,
-    max_tokens: int = 8192
+    max_tokens: int = None
 ) -> list[dict]:
     """
     Filter chunks by token count.
 
     Args:
         chunks: List of chunk dictionaries
         min_tokens: Minimum token count - 0 = keep all (default)
-        max_tokens: Maximum token count (needs re-chunking)
+        max_tokens: Maximum token count (default: CHUNK_MAX_TOKENS env var or 1500)
 
     Returns:
         Filtered list of valid chunks
     """
+    if max_tokens is None:
+        max_tokens = CHUNK_MAX_TOKENS
+
     valid_chunks = []
-
+
     for chunk in chunks:
         token_count = chunk.get('token_count', count_tokens(chunk['text']))
 
