@@ -780,21 +780,10 @@ def __init__(
780780 )
781781
782782 def _generate_table_bbox (self ):
783- user_provided_bboxes = None
784- if self .table_areas is not None :
785- # User gave us table areas already. We will use their coordinates
786- # to find column anchors.
787- user_provided_bboxes = []
788- for area_str in self .table_areas :
789- user_provided_bboxes .append (bbox_from_str (area_str ))
783+ user_provided_bboxes = self ._get_user_provided_bboxes ()
790784
791785 # Take all the textlines that are not just spaces
792- all_textlines = [
793- t
794- for t in self .horizontal_text + self .vertical_text
795- if len (t .get_text ().strip ()) > 0
796- ]
797- textlines = self ._apply_regions_filter (all_textlines )
786+ textlines = self ._get_filtered_textlines ()
798787
799788 textlines_processed = {}
800789 self .table_bbox_parses = {}
@@ -873,11 +862,34 @@ def _generate_table_bbox(self):
873862 self .parse_details ["col_searches" ].append (table_parse )
874863
875864 # Remember what textlines we processed, and repeat
876- for textline in tls_in_bbox :
877- textlines_processed [textline ] = None
878865 textlines = list (
879866 filter (lambda textline : textline not in textlines_processed , textlines )
880867 )
868+ self ._mark_processed_textlines (tls_in_bbox , textlines_processed , textlines )
869+
870+ def _get_filtered_textlines (self ):
871+ all_textlines = [
872+ t
873+ for t in self .horizontal_text + self .vertical_text
874+ if len (t .get_text ().strip ()) > 0
875+ ]
876+ return self ._apply_regions_filter (all_textlines )
877+
878+ def _mark_processed_textlines (
879+ self , tls_in_bbox , textlines_processed , all_textlines
880+ ):
881+ for textline in tls_in_bbox :
882+ textlines_processed [textline ] = None
883+ all_textlines [:] = [
884+ textline
885+ for textline in all_textlines
886+ if textline not in textlines_processed
887+ ]
888+
889+ def _get_user_provided_bboxes (self ):
890+ if self .table_areas is not None :
891+ return [bbox_from_str (area_str ) for area_str in self .table_areas ]
892+ return None
881893
882894 def _generate_columns_and_rows (self , bbox , user_cols ):
883895 # select elements which lie within table_bbox
0 commit comments