@@ -33,33 +33,29 @@ def __init__(
3333 visualize : bool = False ,
3434 visualization_save_dir : str = None ,
3535 ):
36- self ._pdf_extractor = pdf_extractor
37- self ._table_detector = table_detector
38- self ._structure_recognizer = structure_recognizer
39- self ._ocr_service = ocr_service
40- self ._validation_service = TableValidationService ( )
36+ self .pdf_extractor = pdf_extractor
37+ self .table_detector = table_detector
38+ self .structure_recognizer = structure_recognizer
39+ self .validation_service = TableValidationService ()
40+ self .grid_builder = TableGridBuilder ( ocr_service )
4141 self ._visualize = visualize
4242 self ._visualization_save_dir = visualization_save_dir
4343
4444 def extract_tables_from_page (
4545 self , pdf_path : str , page_number : int
4646 ) -> List [DetectedTable ]:
4747 """Extract all tables from a PDF page."""
48- # Extract page image
49- page_image = self ._pdf_extractor .extract_page_image (pdf_path , page_number )
48+ page_image = self .pdf_extractor .extract_page_image (pdf_path , page_number )
5049
51- # Detect tables
52- detected_tables = self ._table_detector .detect_tables (page_image )
50+ detected_tables = self .table_detector .detect_tables (page_image )
5351
54- # Visualization: Table detection
5552 if self ._visualize :
5653 visualize_table_detection (
5754 page_image ,
5855 detected_tables ,
5956 visualization_save_dir = self ._visualization_save_dir ,
6057 )
6158
62- # Process each detected table
6359 structured_tables = []
6460 for idx , table in enumerate (detected_tables ):
6561 try :
@@ -78,31 +74,25 @@ def _process_detected_table(
7874 self , page_image : PageImage , detected_table : DetectedTable , table_idx : int = 0
7975 ) -> Optional [DetectedTable ]:
8076 """Process a single detected table to extract its structure."""
81- # Recognize table structure
82- detected_cells = self ._structure_recognizer .recognize_structure (
77+ detected_cells = self .structure_recognizer .recognize_structure (
8378 page_image , detected_table .detection_box
8479 )
8580
86- # Visualization: Table structure
8781 if self ._visualize :
8882 visualize_table_structure (
8983 page_image ,
9084 detected_cells ,
9185 detected_table .detection_box ,
92- self ._visualization_save_dir
86+ self ._visualization_save_dir ,
9387 )
9488
95- # Validate detected structure
96- if not self ._validation_service .is_valid_table_structure (detected_cells ):
89+ if not self .validation_service .is_valid_table_structure (detected_cells ):
9790 return None
9891
99- # Build table grid
100- grid_builder = TableGridBuilder (self ._ocr_service )
101- table_grid = grid_builder .build_grid (
92+ table_grid = self .grid_builder .build_grid (
10293 detected_cells , page_image , detected_table .detection_box
10394 )
10495
105- # Visualization: Cell grid
10696 if self ._visualize and table_grid :
10797 visualize_cell_grid (
10898 table_grid ,
@@ -114,6 +104,5 @@ def _process_detected_table(
114104 if not table_grid :
115105 return None
116106
117- # Update detected table with grid
118107 detected_table .grid = table_grid
119108 return detected_table
0 commit comments