11"""
22PDF Utilities Module
33
4- Provides utility functions for creating test PDFs.
4+ Provides utility functions for creating test PDFs and extracting text.
55"""
66
77import re
88from pathlib import Path
9- from typing import List
9+ from typing import List , Tuple , Optional
1010
1111import pypdf
12+ from ..utils import get_logger
13+
14+ logger = get_logger (__name__ )
1215
1316
1417def create_pdf (pages : List [str ]) -> bytes :
@@ -136,4 +139,117 @@ def is_textual_pdf(
136139
137140 except Exception :
138141 # If PDF can't be read, assume it's not textual
139- return 0.0
142+ return 0.0
143+
144+
def extract_text_from_pdf(path: str | Path) -> str:
    """
    Extract all text from a PDF file.

    Extraction is best-effort: a page whose text extraction raises is
    logged at debug level and skipped, and a PDF that cannot be read at
    all is logged at warning level and yields an empty string.

    Args:
        path: Path to the PDF file

    Returns:
        str: Text of every page that produced non-blank text, with a blank
        line between pages. Empty string if no page produced text or the
        PDF could not be read.
    """
    try:
        reader = pypdf.PdfReader(str(path))
        text_parts = []

        for page_num, page in enumerate(reader.pages):
            try:
                text = page.extract_text()
            except Exception as e:
                # One unreadable page must not discard the text already
                # collected from the other pages.
                logger.debug(f"Failed to extract text from page {page_num}: {e}")
                continue

            # Skip pages with no usable text (falsy result or whitespace only)
            # so they do not add empty sections to the joined output.
            if text and text.strip():
                text_parts.append(text)

        # Blank line between pages
        return "\n\n".join(text_parts)

    except Exception as e:
        # Opening the file or walking its pages failed; callers rely on an
        # empty string rather than an exception.
        logger.warning(f"Failed to extract text from PDF {path}: {e}")
        return ""
173+
174+
def get_pdf_info(path: str | Path, *,
                 textual_threshold: float = 0.5) -> Tuple[int, bool, Optional[str]]:
    """
    Get information about a PDF file for cost estimation.

    Args:
        path: Path to the PDF file
        textual_threshold: Score returned by is_textual_pdf() that must be
            exceeded for the PDF to be treated as textual (default: 0.5)

    Returns:
        Tuple of (page_count, is_textual, extracted_text)
        - page_count: Number of pages in the PDF
        - is_textual: Whether the PDF has extractable text
        - extracted_text: Text content if textual, None otherwise

        If anything fails while reading the PDF, the error is logged and
        (0, False, None) is returned.
    """
    try:
        reader = pypdf.PdfReader(str(path))
        page_count = len(reader.pages)

        # Check if PDF is textual.
        # NOTE(review): the score is presumably the share of pages that
        # contain text - confirm against is_textual_pdf().
        textual_score = is_textual_pdf(path)
        is_textual = textual_score > textual_threshold

        # Extract text if textual
        extracted_text = None
        if is_textual:
            extracted_text = extract_text_from_pdf(path)
            if not extracted_text.strip():
                # The score said "textual" but extraction produced nothing
                # usable, so report the PDF as non-textual.
                is_textual = False
                extracted_text = None

        logger.debug(f"PDF info for {path}: {page_count} pages, textual={is_textual}, "
                     f"text_length={len(extracted_text) if extracted_text else 0}")

        return page_count, is_textual, extracted_text

    except Exception as e:
        logger.error(f"Failed to get PDF info for {path}: {e}")
        return 0, False, None
212+
213+
def estimate_pdf_tokens(path: str | Path, prompt: Optional[str] = None,
                        pdf_token_multiplier: float = 1.5,
                        tokens_per_page: int = 2000) -> int:
    """
    Estimate token count for a PDF file.

    This is a generic utility that can be used by any provider to estimate
    tokens for PDF processing.

    For a textual PDF the estimate is the token count of the extracted text
    scaled by ``pdf_token_multiplier``. Otherwise it is ``tokens_per_page``
    for each page. In both cases the prompt's tokens are added unscaled.

    Args:
        path: Path to the PDF file
        prompt: Optional prompt to include in token count
        pdf_token_multiplier: Coefficient to apply to extracted text tokens
            to account for PDF processing overhead (default: 1.5)
        tokens_per_page: Estimated tokens per page for image-based PDFs (default: 2000)

    Returns:
        Estimated token count
    """
    # Function-scope import, as in the original.
    # NOTE(review): presumably avoids a circular import with .llm - confirm.
    from .llm import token_count_simple

    page_count, is_textual, extracted_text = get_pdf_info(path)

    # The prompt is not part of the PDF, so it is counted once here and
    # added unscaled in both branches.
    prompt_tokens = token_count_simple(prompt) if prompt else 0

    if is_textual and extracted_text:
        # Count tokens from extracted text
        base_tokens = token_count_simple(extracted_text)

        # Apply multiplier to the extracted-text tokens only, to account
        # for PDF processing overhead
        scaled_tokens = int(base_tokens * pdf_token_multiplier)
        input_tokens = scaled_tokens + prompt_tokens
        logger.debug(f"Textual PDF {path}: {page_count} pages, "
                     f"base tokens: {base_tokens}, with {pdf_token_multiplier}x multiplier: {scaled_tokens}, "
                     f"prompt tokens: {prompt_tokens}, total: {input_tokens}")
    else:
        # Estimate based on page count
        input_tokens = page_count * tokens_per_page + prompt_tokens
        logger.debug(f"Image-based PDF {path}: {page_count} pages, "
                     f"estimated tokens: {input_tokens} ({tokens_per_page} per page)")

    return input_tokens
0 commit comments