@@ -34,12 +34,16 @@ def process_pdf(self, file_path: str | Path) -> List[Dict[str, Any]]:
34
34
# Process chunks into a standardized format
35
35
processed_chunks = []
36
36
for chunk in chunks :
37
+ # Handle both dictionary and DocChunk objects
38
+ text = chunk .text if hasattr (chunk , 'text' ) else chunk .get ('text' , '' )
39
+ meta = chunk .meta if hasattr (chunk , 'meta' ) else chunk .get ('meta' , {})
40
+
37
41
processed_chunk = {
38
- "text" : chunk [ " text" ] ,
42
+ "text" : text ,
39
43
"metadata" : {
40
44
"source" : str (file_path ),
41
- "headings" : chunk [ " meta" ] .get ("headings" , []),
42
- "page_numbers" : self ._extract_page_numbers (chunk [ " meta" ] ),
45
+ "headings" : meta .get ("headings" , []),
46
+ "page_numbers" : self ._extract_page_numbers (meta ),
43
47
}
44
48
}
45
49
processed_chunks .append (processed_chunk )
@@ -63,12 +67,16 @@ def process_pdf_url(self, url: str) -> List[Dict[str, Any]]:
63
67
# Process chunks into a standardized format
64
68
processed_chunks = []
65
69
for chunk in chunks :
70
+ # Handle both dictionary and DocChunk objects
71
+ text = chunk .text if hasattr (chunk , 'text' ) else chunk .get ('text' , '' )
72
+ meta = chunk .meta if hasattr (chunk , 'meta' ) else chunk .get ('meta' , {})
73
+
66
74
processed_chunk = {
67
- "text" : chunk [ " text" ] ,
75
+ "text" : text ,
68
76
"metadata" : {
69
77
"source" : url ,
70
- "headings" : chunk [ " meta" ] .get ("headings" , []),
71
- "page_numbers" : self ._extract_page_numbers (chunk [ " meta" ] ),
78
+ "headings" : meta .get ("headings" , []),
79
+ "page_numbers" : self ._extract_page_numbers (meta ),
72
80
}
73
81
}
74
82
processed_chunks .append (processed_chunk )
@@ -115,6 +123,10 @@ def main():
115
123
processor = PDFProcessor (tokenizer = args .tokenizer )
116
124
117
125
try :
126
+ # Create output directory if it doesn't exist
127
+ output_dir = Path (args .output ).parent
128
+ output_dir .mkdir (parents = True , exist_ok = True )
129
+
118
130
if is_url (args .input ):
119
131
print (f"\n Processing PDF from URL: { args .input } " )
120
132
print ("=" * 50 )
0 commit comments