3 files changed, +10 -2 lines

CHANGELOG.md

@@ -7,6 +7,12 @@ and the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.
 
 ## [Unreleased]
 
+## [1.3.1] - 2025-12-04
+
+### Fixed
+- Reduced default EMBEDDING_BATCH_SIZE from 32 to 10 to prevent Ollama "cannot decode batches" errors with large uploads
+- New `EMBEDDING_BATCH_SIZE` env var allows tuning for different Ollama configurations
+
 ## [1.3.0] - 2025-12-04
 
 ### Changed

README.md

@@ -94,6 +94,7 @@ docker compose up -d
 | `OLLAMA_MODEL` | `nomic-embed-text` | Embedding model |
 | `CHUNK_SIZE` | `400` | Target chunk size in tokens |
 | `CHUNK_MAX_TOKENS` | `1500` | Maximum chunk size (safe margin for nomic-embed-text 2048 limit) |
+| `EMBEDDING_BATCH_SIZE` | `10` | Chunks per embedding API call (reduce if Ollama errors) |
 
 ## Features
 
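What one "embedding API call" means in the row above: up to `EMBEDDING_BATCH_SIZE` chunk texts sent in a single request. Below is a minimal sketch of such a call, assuming Ollama's `/api/embed` endpoint, which accepts a list of inputs and returns one vector per input; the `embed_batch` name and the `requests` usage are illustrative, not code from this repo:

```python
import requests

OLLAMA_URL = "http://localhost:11434"
EMBEDDING_MODEL = "nomic-embed-text"

def embed_batch(texts: list[str]) -> list[list[float]]:
    """Embed a batch of texts in one request (illustrative sketch)."""
    # /api/embed accepts "input" as a string or a list of strings and
    # returns {"embeddings": [...]} with one vector per input text.
    resp = requests.post(
        f"{OLLAMA_URL}/api/embed",
        json={"model": EMBEDDING_MODEL, "input": texts},
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()["embeddings"]
```

Larger batches mean fewer HTTP round trips, but oversized ones are what triggered the "cannot decode batches" failures this release works around, hence the smaller default of 10.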
Original file line number Diff line number Diff line change 1717OLLAMA_URL = os .getenv ('OLLAMA_URL' , 'http://localhost:11434' )
1818EMBEDDING_MODEL = os .getenv ('EMBEDDING_MODEL' , 'nomic-embed-text' )
1919MAX_TOKENS = 2048 # nomic-embed-text context limit (configurable models may differ)
20+ EMBEDDING_BATCH_SIZE = int (os .getenv ('EMBEDDING_BATCH_SIZE' , '10' )) # Reduced from 32 to avoid Ollama batch decode errors
2021
2122
2223def get_embedding (text : str , timeout : int = 60 , max_retries : int = 3 ) -> Optional [list [float ]]:
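Because the value is parsed with `int(os.getenv('EMBEDDING_BATCH_SIZE', '10'))`, it can be tuned per deployment without code changes: set the variable in the process environment, or (assuming the compose file forwards it) in the service's `environment:` block.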
@@ -240,7 +241,7 @@ def safe_embed_chunk(
 def batch_embed_chunks(
     chunks: list[dict],
     max_tokens: int = MAX_TOKENS,
-    batch_size: int = 32
+    batch_size: int = EMBEDDING_BATCH_SIZE
 ) -> list[dict]:
     """
     Embed multiple chunks using batch API for better performance.
@@ -251,7 +252,7 @@ def batch_embed_chunks(
     Args:
         chunks: List of chunk dictionaries with 'text' key
         max_tokens: Maximum tokens per chunk
-        batch_size: Number of texts to embed in a single API call (default: 32)
+        batch_size: Number of texts to embed per API call (default: EMBEDDING_BATCH_SIZE env or 10)
 
     Returns:
         List of successfully embedded chunks (flattened if re-chunking occurred)
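The docstring describes the contract; the loop body itself is not part of this diff. A hedged sketch of the batching logic it implies, reusing the illustrative `embed_batch` helper from the README section above (the real function also re-chunks oversized texts, omitted here):

```python
def batch_embed_chunks_sketch(
    chunks: list[dict],
    batch_size: int = 10,  # EMBEDDING_BATCH_SIZE default in this release
) -> list[dict]:
    """Illustrative only: embed chunks in batch_size groups."""
    embedded: list[dict] = []
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        # One API call per batch; each request stays small enough
        # for Ollama to decode.
        vectors = embed_batch([c["text"] for c in batch])
        for chunk, vector in zip(batch, vectors):
            chunk["embedding"] = vector
            embedded.append(chunk)
    return embedded
```

Lowering `batch_size` from 32 to 10 does not change the total work, only how many texts share a request.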