Skip to content

Commit bd7b8a7

Browse files
committed
fix: improve PDF processor error handling and reduce token size — add CUDA compilation warning suppression; reduce initial chunk size from 384 to 256 tokens; add a fallback mechanism; suppress token-length warnings; improve chunking error handling
1 parent 4036328 commit bd7b8a7

File tree

1 file changed: +15 additions, −3 deletions

agentic_rag/pdf_processor.py

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,14 @@ def is_url(string: str) -> bool:
2222
class PDFProcessor:
2323
def __init__(self, tokenizer: str = "BAAI/bge-small-en-v1.5"):
2424
"""Initialize PDF processor with Docling components"""
25+
# Suppress CUDA compilation warnings
26+
warnings.filterwarnings('ignore', category=UserWarning, module='torch.utils.cpp_extension')
27+
# Suppress token length warnings
28+
warnings.filterwarnings('ignore', category=UserWarning, module='transformers.generation.utils')
29+
warnings.filterwarnings('ignore', category=UserWarning, module='transformers.modeling_utils')
30+
2531
self.converter = DocumentConverter()
26-
self.chunker = HybridChunker(tokenizer=tokenizer, max_chunk_size=384) # Reduced chunk size
32+
self.chunker = HybridChunker(tokenizer=tokenizer, max_chunk_size=256) # Reduced chunk size for token length
2733

2834
def _extract_metadata(self, meta: Any) -> Dict[str, Any]:
2935
"""Safely extract metadata from various object types"""
@@ -61,8 +67,14 @@ def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
6167
if not conv_result or not conv_result.document:
6268
raise ValueError(f"Failed to convert PDF: {file_path}")
6369

64-
# Chunk the document
65-
chunks = list(self.chunker.chunk(conv_result.document))
70+
# Chunk the document with error handling
71+
try:
72+
chunks = list(self.chunker.chunk(conv_result.document))
73+
except Exception as chunk_error:
74+
print(f"Warning: Error during chunking: {str(chunk_error)}")
75+
# Fallback to smaller chunk size if needed
76+
self.chunker.max_chunk_size = 128
77+
chunks = list(self.chunker.chunk(conv_result.document))
6678

6779
# Process chunks into a standardized format
6880
processed_chunks = []

0 commit comments

Comments (0)