@@ -125,19 +125,105 @@ def find_closest_tls( # noqa: C901
125125 }
126126
127127
128- def search_header_from_body_bbox (body_bbox , textlines , col_anchors , max_v_gap ):
129- """Expand a bbox vertically up by looking for plausible headers.
def _extract_zones(
    all_above: list[Any], max_v_gap: float, top: float
) -> tuple[list[list[float]], float]:
    """Collect the textlines forming the row directly above ``top``.

    Starting from the boundary ``top``, every textline whose bottom edge
    (``y0``) lies below the boundary is pulled into the row. Whenever such a
    textline's top edge (``y1``) is higher than the boundary, the boundary is
    raised to it and the remaining candidates are scanned again, until a full
    pass no longer raises the boundary.

    Parameters
    ----------
    all_above : List[Any]
        Candidate textlines (objects exposing ``x0``, ``x1``, ``y0`` and
        ``y1``). This list is modified in place: the textlines absorbed into
        the row are removed from it.
    max_v_gap : float
        The maximum vertical gap allowed. Not read by this helper; kept so
        the signature stays stable for existing callers.
    top : float
        The current top boundary.

    Returns
    -------
    Tuple[List[List[float]], float]
        The ``[x0, x1]`` horizontal extent of each absorbed textline (in the
        order they were absorbed) and the new top boundary.
    """
    row_textlines: list[Any] = []
    pushed_up = True

    while pushed_up:
        pushed_up = False
        # Partition the candidates in a single linear pass instead of copying
        # the list and calling ``list.remove`` per hit (quadratic, and
        # equality-based rather than identity-based).
        remaining: list[Any] = []
        for textline in all_above:
            if textline.y0 < top:
                # The bottom of this element is within our row, so we add it.
                row_textlines.append(textline)
                if textline.y1 > top:
                    # The top of this element raises our row's band: another
                    # pass is needed to catch items that now overlap it.
                    top = textline.y1
                    pushed_up = True
            else:
                remaining.append(textline)
        # Slice assignment keeps the caller's list object, preserving the
        # in-place pruning the caller observes.
        all_above[:] = remaining

    return [[textline.x0, textline.x1] for textline in row_textlines], top
167+
168+
def _merge_zones(zones: list[list[float]]) -> list[list[float]]:
    """Merge overlapping zones into consolidated zones.

    Zones are processed from left to right (ordered by their left
    coordinate). A zone that overlaps or touches the previously merged zone
    extends it; otherwise it starts a new merged zone.

    The input is left untouched: neither the outer list nor the individual
    zones are reordered or modified.

    Parameters
    ----------
    zones : List[List[float]]
        A list of zones, each given as ``[left, right]`` x-coordinates.

    Returns
    -------
    List[List[float]]
        A new list of merged ``[left, right]`` zones, sorted by left
        coordinate.
    """
    merged_zones: list[list[float]] = []

    # ``sorted`` (rather than ``zones.sort``) avoids reordering the caller's
    # list as a side effect.
    for zone in sorted(zones, key=lambda z: z[0]):
        start, end = zone[0], zone[1]
        if merged_zones and merged_zones[-1][1] >= start:
            # Overlapping (or touching) the previous zone: extend it.
            merged_zones[-1][1] = max(merged_zones[-1][1], end)
        else:
            # Store a fresh pair so later extensions never write through to
            # the caller's zone objects.
            merged_zones.append([start, end])

    return merged_zones
192+
193+
194+ def search_header_from_body_bbox (
195+ body_bbox : tuple [float , float , float , float ],
196+ textlines : list [Any ],
197+ col_anchors : list [float ],
198+ max_v_gap : float ,
199+ ) -> tuple [float , float , float , float ]:
200+ """Expand a bounding box (bbox) vertically by looking for plausible headers.
130201
131202 The core algorithm is based on fairly strict alignment of text. It works
132- for the table body, but might fail on tables' headers since they tend to be
133- in a different font, alignment (e.g. vertical), etc.
134- This method evalutes the area above the table body's bbox for
135- characteristics of a table header: close to the top of the body, with cells
136- that fit within the horizontal bounds identified.
203+ for the table body but might fail on table headers since they tend to be
204+ in a different font, alignment (e.g., vertical), etc. This method evaluates
205+ the area above the table body's bbox for characteristics of a table header:
206+ close to the top of the body, with cells that fit within the horizontal bounds identified.
207+
208+ Parameters
209+ ----------
210+ body_bbox : Tuple[float, float, float, float]
211+ The bounding box of the body in the format (left, bottom, right, top).
212+ textlines : List[Any]
213+ A list of textline objects, each with properties x0, x1, y0, and y1.
214+ col_anchors : List[float]
215+ A list of x-coordinates representing column anchors.
216+ max_v_gap : float
217+ The maximum vertical gap allowed to consider a header plausible.
218+
219+ Returns
220+ -------
221+ Tuple[float, float, float, float]
222+ The expanded bounding box in the format (left, bottom, right, top).
137223 """
138224 new_bbox = body_bbox
139225 (left , bottom , right , top ) = body_bbox
140- zones = []
226+ zones : list [ list [ float ]] = []
141227
142228 keep_searching = True
143229 while keep_searching :
@@ -154,55 +240,15 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
154240 closest_above = min (all_above , key = lambda tl : tl .y0 , default = None )
155241
156242 if closest_above and closest_above .y0 < top + max_v_gap :
157- # b/ We have a candidate cell that is within the correct
243+ # We have a candidate cell that is within the correct
158244 # vertical band, and directly above the table. Starting from
159245 # this anchor, we list all the textlines within the same row.
160- tls_in_new_row = []
161- top = closest_above .y1
162- pushed_up = True
163- while pushed_up :
164- pushed_up = False
165- # Iterate and extract elements that fit in the row
166- # from our list
167- for i in range (len (all_above ) - 1 , - 1 , - 1 ):
168- textline = all_above [i ]
169- if textline .y0 < top :
170- # The bottom of this element is within our row
171- # so we add it.
172- tls_in_new_row .append (textline )
173- all_above .pop (i )
174- if textline .y1 > top :
175- # If the top of this element raises our row's
176- # band, we'll need to keep on searching for
177- # overlapping items
178- top = textline .y1
179- pushed_up = True
180-
181- # Get the x-ranges for all the textlines, and merge the
182- # x-ranges that overlap
183- zones = zones + list (
184- map (lambda textline : [textline .x0 , textline .x1 ], tls_in_new_row )
185- )
186- zones .sort (key = lambda z : z [0 ]) # Sort by left coordinate
187- # Starting from the right, if two zones overlap horizontally,
188- # merge them
189- merged_something = True
190- while merged_something :
191- merged_something = False
192- for i in range (len (zones ) - 1 , 0 , - 1 ):
193- zone_right = zones [i ]
194- zone_left = zones [i - 1 ]
195- if zone_left [1 ] >= zone_right [0 ]:
196- zone_left [1 ] = max (zone_right [1 ], zone_left [1 ])
197- zones .pop (i )
198- merged_something = True
246+ zones , top = _extract_zones (all_above , max_v_gap , closest_above .y1 )
247+ # Starting from the right, if two zones overlap horizontally, merge them
248+ merged_zones = _merge_zones (zones )
199249
200250 max_spread = max (
201- list (
202- map (
203- lambda zone : column_spread (zone [0 ], zone [1 ], col_anchors ), zones
204- )
205- )
251+ column_spread (zone [0 ], zone [1 ], col_anchors ) for zone in merged_zones
206252 )
207253
208254 # Accept textlines that cross columns boundaries, as long as they
0 commit comments