@@ -18,7 +18,35 @@ class PDFProcessor:
18
18
def __init__(self, tokenizer: str = "BAAI/bge-small-en-v1.5", max_tokens: int = 384):
    """Initialize the PDF processor with its Docling components.

    Args:
        tokenizer: Tokenizer identifier forwarded to ``HybridChunker``.
        max_tokens: Per-chunk token limit forwarded to ``HybridChunker``.
            Defaults to the reduced size of 384.
    """
    self.converter = DocumentConverter()
    # The chunk-size option on HybridChunker is ``max_tokens``. The earlier
    # ``max_chunk_size`` keyword is not a chunker field, so the intended
    # reduction would not have been applied.
    # NOTE(review): confirm against the installed docling version.
    self.chunker = HybridChunker(tokenizer=tokenizer, max_tokens=max_tokens)
22
+
23
+ def _extract_metadata (self , meta : Any ) -> Dict [str , Any ]:
24
+ """Safely extract metadata from various object types"""
25
+ try :
26
+ if hasattr (meta , '__dict__' ):
27
+ # If it's an object with attributes
28
+ return {
29
+ "headings" : getattr (meta , "headings" , []),
30
+ "page_numbers" : self ._extract_page_numbers (meta )
31
+ }
32
+ elif isinstance (meta , dict ):
33
+ # If it's a dictionary
34
+ return {
35
+ "headings" : meta .get ("headings" , []),
36
+ "page_numbers" : self ._extract_page_numbers (meta )
37
+ }
38
+ else :
39
+ # Default empty metadata
40
+ return {
41
+ "headings" : [],
42
+ "page_numbers" : []
43
+ }
44
+ except Exception as e :
45
+ print (f"Warning: Error extracting metadata: { str (e )} " )
46
+ return {
47
+ "headings" : [],
48
+ "page_numbers" : []
49
+ }
22
50
23
51
def process_pdf (self , file_path : str | Path ) -> List [Dict [str , Any ]]:
24
52
"""Process a PDF file and return chunks of text with metadata"""
@@ -38,13 +66,12 @@ def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
38
66
text = chunk .text if hasattr (chunk , 'text' ) else chunk .get ('text' , '' )
39
67
meta = chunk .meta if hasattr (chunk , 'meta' ) else chunk .get ('meta' , {})
40
68
69
+ metadata = self ._extract_metadata (meta )
70
+ metadata ["source" ] = str (file_path )
71
+
41
72
processed_chunk = {
42
73
"text" : text ,
43
- "metadata" : {
44
- "source" : str (file_path ),
45
- "headings" : meta .get ("headings" , []),
46
- "page_numbers" : self ._extract_page_numbers (meta ),
47
- }
74
+ "metadata" : metadata
48
75
}
49
76
processed_chunks .append (processed_chunk )
50
77
@@ -71,13 +98,12 @@ def process_pdf_url(self, url: str) -> List[Dict[str, Any]]:
71
98
text = chunk .text if hasattr (chunk , 'text' ) else chunk .get ('text' , '' )
72
99
meta = chunk .meta if hasattr (chunk , 'meta' ) else chunk .get ('meta' , {})
73
100
101
+ metadata = self ._extract_metadata (meta )
102
+ metadata ["source" ] = url
103
+
74
104
processed_chunk = {
75
105
"text" : text ,
76
- "metadata" : {
77
- "source" : url ,
78
- "headings" : meta .get ("headings" , []),
79
- "page_numbers" : self ._extract_page_numbers (meta ),
80
- }
106
+ "metadata" : metadata
81
107
}
82
108
processed_chunks .append (processed_chunk )
83
109
@@ -101,16 +127,35 @@ def process_directory(self, directory: str | Path) -> List[Dict[str, Any]]:
101
127
102
128
return all_chunks
103
129
104
- def _extract_page_numbers (self , meta : Dict ) -> List [int ]:
130
+ def _extract_page_numbers (self , meta : Any ) -> List [int ]:
105
131
"""Extract page numbers from chunk metadata"""
106
132
page_numbers = set ()
107
- if "doc_items" in meta :
108
- for item in meta ["doc_items" ]:
109
- if "prov" in item :
110
- for prov in item ["prov" ]:
111
- if "page_no" in prov :
112
- page_numbers .add (prov ["page_no" ])
113
- return sorted (list (page_numbers ))
133
+ try :
134
+ if hasattr (meta , 'doc_items' ):
135
+ items = meta .doc_items
136
+ elif isinstance (meta , dict ) and 'doc_items' in meta :
137
+ items = meta ['doc_items' ]
138
+ else :
139
+ return []
140
+
141
+ for item in items :
142
+ if hasattr (item , 'prov' ):
143
+ provs = item .prov
144
+ elif isinstance (item , dict ) and 'prov' in item :
145
+ provs = item ['prov' ]
146
+ else :
147
+ continue
148
+
149
+ for prov in provs :
150
+ if hasattr (prov , 'page_no' ):
151
+ page_numbers .add (prov .page_no )
152
+ elif isinstance (prov , dict ) and 'page_no' in prov :
153
+ page_numbers .add (prov ['page_no' ])
154
+
155
+ return sorted (list (page_numbers ))
156
+ except Exception as e :
157
+ print (f"Warning: Error extracting page numbers: { str (e )} " )
158
+ return []
114
159
115
160
def main ():
116
161
parser = argparse .ArgumentParser (description = "Process PDF files and extract text chunks" )
0 commit comments