11from pdf2table .usecases .table_extraction_use_case import TableExtractionUseCase
2- from pdf2table .usecases .dtos import TableExtractionRequest
32from pdf2table .adaptors .table_extraction_adaptor import TableExtractionAdapter
43from pdf2table .frameworks .pdf_image_extractor import PyMuPDFImageExtractor
54from pdf2table .frameworks .table_transformer_detector import TableTransformerDetector
@@ -27,26 +26,28 @@ def create_table_extraction_adapter(
2726 visualization_save_dir : str = "data/table_visualizations" , # Optional save dir
2827 ) -> TableExtractionAdapter :
2928 """Create a fully configured table extraction adapter."""
30-
31- logger .info (f"Creating table extraction adapter - Device: { device } , "
32- f"Detection threshold: { detection_threshold } , "
33- f"Structure threshold: { structure_threshold } , "
34- f"PDF DPI: { pdf_dpi } , OCR: { load_ocr } , Visualize: { visualize } " )
29+
30+ logger .info (
31+ f"Creating table extraction adapter - Device: { device } , "
32+ f"Detection threshold: { detection_threshold } , "
33+ f"Structure threshold: { structure_threshold } , "
34+ f"PDF DPI: { pdf_dpi } , OCR: { load_ocr } , Visualize: { visualize } "
35+ )
3536
3637 # Create framework implementations (outermost layer)
3738 logger .debug ("Initializing PDF image extractor" )
3839 pdf_extractor = PyMuPDFImageExtractor (dpi = pdf_dpi )
39-
40+
4041 logger .debug ("Initializing table transformer detector" )
4142 table_detector = TableTransformerDetector (
4243 device = device , confidence_threshold = detection_threshold
4344 )
44-
45+
4546 logger .debug ("Initializing table structure recognizer" )
4647 structure_recognizer = TableTransformerStructureRecognizer (
4748 device = device , confidence_threshold = structure_threshold
4849 )
49-
50+
5051 if load_ocr :
5152 logger .debug ("Initializing OCR service" )
5253 ocr_service = TrOCRService (device = device )
@@ -66,7 +67,7 @@ def create_table_extraction_adapter(
6667
6768 logger .debug ("Creating table extraction adapter" )
6869 adapter = TableExtractionAdapter (table_extraction_use_case )
69-
70+
7071 logger .info ("Table extraction adapter created successfully" )
7172 return adapter
7273
@@ -87,12 +88,13 @@ def __init__(self, device: str = "cpu"):
8788 def extract_tables_from_page (self , pdf_path : str , page_number : int ) -> dict :
8889 """Extract tables from a single PDF page."""
8990 logger .info (f"Extracting tables from { pdf_path } , page { page_number } " )
90- request = TableExtractionRequest (pdf_path , page_number )
9191 try :
92- response = self ._adapter .extract_tables (request )
92+ response = self ._adapter .extract_tables (pdf_path , page_number )
9393 result = response .to_dict ()
94- tables_count = len (result .get ('tables' , []))
95- logger .info (f"Successfully extracted { tables_count } tables from page { page_number } " )
94+ tables_count = len (result .get ("tables" , []))
95+ logger .info (
96+ f"Successfully extracted { tables_count } tables from page { page_number } "
97+ )
9698 return result
9799 except Exception as e :
98100 logger .error (f"Failed to extract tables from page { page_number } : { e } " )
@@ -101,32 +103,23 @@ def extract_tables_from_page(self, pdf_path: str, page_number: int) -> dict:
101103 def extract_tables_from_pdf (self , pdf_path : str ) -> list [dict ]:
102104 """Extract tables from all pages of a PDF."""
103105 logger .info (f"Starting table extraction from entire PDF: { pdf_path } " )
104-
105- from pdf2table .frameworks .pdf_image_extractor import PyMuPDFImageExtractor
106-
107- # Get page count
108- pdf_extractor = PyMuPDFImageExtractor ()
109- page_count = pdf_extractor .get_page_count (pdf_path )
110- logger .info (f"PDF has { page_count } pages" )
111-
112- results = []
113- successful_pages = 0
114-
115- for page_number in range (page_count ):
116- try :
117- result = self .extract_tables_from_page (pdf_path , page_number )
118- results .append (result )
119- successful_pages += 1
120- except Exception as e :
121- logger .error (f"Failed to process page { page_number } : { e } " )
122- results .append (
123- {
124- "success" : False ,
125- "error" : str (e ),
126- "page_number" : page_number ,
127- "source_file" : pdf_path ,
128- }
106+
107+ try :
108+ response = self ._adapter .extract_tables (pdf_path )
109+ result = response .to_dict ()
110+
111+ if result .get ("success" ):
112+ tables_count = len (result .get ("tables" , []))
113+ logger .info (
114+ f"Successfully extracted { tables_count } tables from entire PDF"
129115 )
130-
131- logger .info (f"Completed PDF processing - { successful_pages } /{ page_count } pages successful" )
132- return results
116+ else :
117+ logger .error (
118+ f"Failed to extract tables from PDF: { result .get ('error' )} "
119+ )
120+
121+ return result
122+
123+ except Exception as e :
124+ logger .error (f"Failed to extract tables from PDF: { e } " )
125+ raise
0 commit comments