Skip to content

Commit f934e53

Browse files
committed
fix: correct HybridChunker initialization and chunk size setting
1 parent dd1ac41 commit f934e53

File tree

1 file changed

+8
-3
lines changed

1 file changed

+8
-3
lines changed

agentic_rag/pdf_processor.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def __init__(self, tokenizer: str = "BAAI/bge-small-en-v1.5"):
3030
warnings.filterwarnings('ignore', category=UserWarning, module='transformers.modeling_utils')
3131

3232
self.converter = DocumentConverter()
33-
self.chunker = HybridChunker(tokenizer=tokenizer, max_chunk_size=200) # Further reduced chunk size
33+
self.tokenizer = tokenizer
3434

3535
def _extract_metadata(self, meta: Any) -> Dict[str, Any]:
3636
"""Safely extract metadata from various object types"""
@@ -63,8 +63,13 @@ def _extract_metadata(self, meta: Any) -> Dict[str, Any]:
6363
def _try_chunk_with_size(self, document: Any, chunk_size: int) -> List[Any]:
6464
"""Try chunking with a specific size, return None if it fails"""
6565
try:
66-
self.chunker.max_chunk_size = chunk_size
67-
return list(self.chunker.chunk(document))
66+
# Create a new chunker with the specified size
67+
chunker = HybridChunker(
68+
tokenizer=self.tokenizer,
69+
chunk_size=chunk_size,
70+
chunk_overlap=0.1
71+
)
72+
return list(chunker.chunk(document))
6873
except Exception as e:
6974
print(f"Warning: Chunking failed with size {chunk_size}: {str(e)}")
7075
return None

0 commit comments

Comments
 (0)