@@ -22,8 +22,14 @@ def is_url(string: str) -> bool:
22
22
class PDFProcessor :
23
23
def __init__(self, tokenizer: str = "BAAI/bge-small-en-v1.5"):
    """Initialize PDF processor with Docling components.

    Args:
        tokenizer: Tokenizer identifier, passed unchanged to
            ``HybridChunker``.

    Side effects:
        Installs ``warnings`` filters that ignore ``UserWarning`` raised
        from the modules listed below. These filters are global to the
        interpreter, not scoped to this instance.
    """
    # ``module`` is a regular expression matched against the start of the
    # warning's module name, so the dots are escaped to match literally
    # instead of matching any character.
    noisy_modules = (
        r"torch\.utils\.cpp_extension",      # CUDA compilation warnings
        r"transformers\.generation\.utils",  # token length warnings
        r"transformers\.modeling_utils",     # token length warnings
    )
    for module_pattern in noisy_modules:
        warnings.filterwarnings(
            "ignore", category=UserWarning, module=module_pattern
        )

    self.converter = DocumentConverter()
    # Reduced chunk size for token length.
    # NOTE(review): confirm that HybridChunker actually honours
    # ``max_chunk_size``; if its size limit is exposed under a different
    # keyword (e.g. ``max_tokens``), this value may be silently ignored.
    self.chunker = HybridChunker(tokenizer=tokenizer, max_chunk_size=256)
27
33
28
34
def _extract_metadata (self , meta : Any ) -> Dict [str , Any ]:
29
35
"""Safely extract metadata from various object types"""
@@ -61,8 +67,14 @@ def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
61
67
if not conv_result or not conv_result .document :
62
68
raise ValueError (f"Failed to convert PDF: { file_path } " )
63
69
64
- # Chunk the document
65
- chunks = list (self .chunker .chunk (conv_result .document ))
70
+ # Chunk the document with error handling
71
+ try :
72
+ chunks = list (self .chunker .chunk (conv_result .document ))
73
+ except Exception as chunk_error :
74
+ print (f"Warning: Error during chunking: { str (chunk_error )} " )
75
+ # Fallback to smaller chunk size if needed
76
+ self .chunker .max_chunk_size = 128
77
+ chunks = list (self .chunker .chunk (conv_result .document ))
66
78
67
79
# Process chunks into a standardized format
68
80
processed_chunks = []
0 commit comments