@@ -258,60 +258,118 @@ def get_relevant(self):
258258 )
259259
def get_table_areas(self, textlines, relevant_textedges):
    """
    Compute the padded table areas for the PDF page.

    The relevant text edges are merged into candidate areas, the
    candidates are then extended using the text lines, and finally every
    resulting area is padded.

    Parameters
    ----------
    textlines : list
        Text line objects; used to extend the areas and to compute the
        average text line height that is handed to the padding step.
    relevant_textedges : list
        Text edge objects the areas are built from. Sorted in place.

    Returns
    -------
    dict
        Padded table areas as keys, each mapped to None.
    """
    # Reading order: descending y0 first, then ascending coord.
    relevant_textedges.sort(key=lambda edge: (-edge.y0, edge.coord))

    areas = self._initialize_table_areas(relevant_textedges)
    self._extend_table_areas_with_textlines(areas, textlines)

    # Every area is padded with the same average text line height.
    row_height = self._calculate_average_textline_height(textlines)
    return dict.fromkeys(self._pad(area, row_height) for area in areas)
292+
def _initialize_table_areas(self, relevant_textedges):
    """
    Build the initial table areas from the relevant text edges.

    The first text edge seeds the dictionary with a zero-width area
    ``(coord, y0, coord, y1)``; every following edge is handed to
    ``_update_table_areas`` together with the dictionary built so far.

    Parameters
    ----------
    relevant_textedges : list
        Text edge objects exposing ``coord``, ``y0`` and ``y1``.

    Returns
    -------
    dict
        Area tuples as keys, each mapped to None. Empty when no text
        edges are given.
    """
    remaining = iter(relevant_textedges)
    try:
        seed = next(remaining)
    except StopIteration:
        # No text edges at all: nothing to seed the areas with.
        return {}

    areas = {(seed.coord, seed.y0, seed.coord, seed.y1): None}
    for edge in remaining:
        self._update_table_areas(areas, edge)
    return areas
315+
def _update_table_areas(self, table_areas, te):
    """
    Fold one text edge into the table areas, modifying them in place.

    The first area whose vertical span ``[area[1], area[3]]`` overlaps
    the edge's span ``[te.y0, te.y1]`` (bounds inclusive) is replaced by
    a merged area. If no area overlaps, the edge is added as a new
    zero-width area ``(coord, y0, coord, y1)``.

    Parameters
    ----------
    table_areas : dict
        Area tuples ``(x0, y0, x1, y1)`` as keys, mapped to None.
    te : object
        Text edge exposing ``coord``, ``y0`` and ``y1``.

    Returns
    -------
    None
    """
    match = next(
        (area for area in table_areas if area[1] <= te.y1 and area[3] >= te.y0),
        None,
    )

    if match is None:
        table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
        return

    # The merged area keeps the existing x0; only y0, x1 and y1 can change.
    # Removing and re-inserting moves it to the end of the dict's order.
    del table_areas[match]
    merged = (
        match[0],
        min(te.y0, match[1]),
        max(match[2], te.coord),
        max(match[3], te.y1),
    )
    table_areas[merged] = None
349+
350+ def _extend_table_areas_with_textlines (self , table_areas , textlines ):
351+ """
352+ Extend table areas using text lines that lie vertically within them.
353+
354+ Parameters
355+ ----------
356+ table_areas : dict
357+ Current table areas; updated in place.
358+ textlines : list
359+ List of text line objects relevant for extending table areas.
360+
361+ Returns
362+ -------
363+ None
364+ """
307365 for tl in textlines :
308- sum_textline_height += tl .y1 - tl .y0
309366 found = None
310367 for area in table_areas :
311- # check for overlap
368+ # Containment test: the text line must lie vertically within the area
312369 if tl .y0 >= area [1 ] and tl .y1 <= area [3 ]:
313370 found = area
314371 break
372+
315373 if found is not None :
316374 table_areas .pop (found )
317375 updated_area = (
@@ -321,14 +379,46 @@ def pad(area, average_row_height):
321379 max (found [3 ], tl .y1 ),
322380 )
323381 table_areas [updated_area ] = None
324- average_textline_height = sum_textline_height / float (len (textlines ))
325382
326- # add some padding to table areas
327- table_areas_padded = {}
328- for area in table_areas :
329- table_areas_padded [pad (area , average_textline_height )] = None
def _calculate_average_textline_height(self, textlines):
    """
    Return the mean height (``y1 - y0``) of the given text lines.

    Parameters
    ----------
    textlines : iterable
        Objects exposing ``y0`` and ``y1``. Any iterable is accepted,
        including one-shot generators.

    Returns
    -------
    float
        The mean height, or 0.0 when there are no text lines.
    """
    # Materialise the heights once so that emptiness and length are known
    # even for iterables that do not support len().
    heights = [tl.y1 - tl.y0 for tl in textlines]
    if not heights:
        # Return a float so the result type matches the non-empty case.
        return 0.0
    return sum(heights) / float(len(heights))
399+
def _pad(self, area, average_row_height):
    """
    Return a padded copy of ``area``.

    ``x0`` and ``y0`` are decreased and ``x1`` is increased by
    ``TABLE_AREA_PADDING``. ``y1`` is increased by five times
    ``average_row_height`` instead, since table headers can be
    relatively far up.

    Parameters
    ----------
    area : tuple
        The area to pad, given as (x0, y0, x1, y1).
    average_row_height : float
        Average row height; scaled by 5 to pad ``y1``.

    Returns
    -------
    tuple
        The padded area as (x0, y0, x1, y1).
    """
    return (
        area[0] - TABLE_AREA_PADDING,
        area[1] - TABLE_AREA_PADDING,
        area[2] + TABLE_AREA_PADDING,
        area[3] + average_row_height * 5,
    )
332422
333423
334424class Cell :
0 commit comments