11"""General helper utilities to parse the pdf tables."""
22
3+ from __future__ import annotations
4+
35import atexit
46import math
57import os
1214from itertools import groupby
1315from operator import itemgetter
1416from pathlib import Path
17+ from typing import Any
18+ from typing import List
19+ from typing import Tuple
1520from typing import Union
1621from urllib .parse import urlparse as parse_url
1722from urllib .parse import uses_netloc
2631from pdfminer .layout import LTAnno
2732from pdfminer .layout import LTChar
2833from pdfminer .layout import LTImage
34+ from pdfminer .layout import LTTextLine
2935from pdfminer .layout import LTTextLineHorizontal
3036from pdfminer .layout import LTTextLineVertical
3137from pdfminer .pdfdocument import PDFDocument
@@ -83,7 +89,7 @@ def random_string(length):
8389 return ret
8490
8591
86- def download_url (url : str ) -> Union [ StrByteType , Path ] :
92+ def download_url (url : str ) -> StrByteType | Path :
8793 """Download file from specified URL.
8894
8995 Parameters
@@ -951,108 +957,127 @@ def flag_font_size(textline, direction, strip_text=""):
951957 return text_strip (fstring , strip_text )
952958
953959
def split_textline(
    table: Any,
    textline: LTTextLine,
    direction: str,
    flag_size: bool = False,
    strip_text: str = "",
) -> list[tuple[int, int, str]]:
    """Split textline into substrings if it spans across multiple rows/columns.

    Parameters
    ----------
    table : camelot.core.Table
        The table structure containing rows and columns.
    textline : LTTextLine
        PDFMiner LTTextLine object.
    direction : str
        Direction of the PDFMiner LTTextLine object, either "horizontal"
        or "vertical". Any other value produces no cuts.
    flag_size : bool, optional (default: False)
        Whether to highlight a substring using <s></s> if its size differs
        from the rest of the string. (Useful for super and subscripts.)
    strip_text : str, optional (default: '')
        Characters to strip from a string before assigning it to a cell.

    Returns
    -------
    list[tuple[int, int, str]]
        Tuples of the form (r_idx, c_idx, text) where r_idx and c_idx are
        the row and column indices of a cell and text is the LTTextLine
        substring assigned to that cell. ``[(-1, -1, textline.get_text())]``
        is returned when the textline is empty or when the table lookups
        fail with an ``IndexError``.
    """
    bbox = textline.bbox

    if textline.is_empty():
        return [(-1, -1, textline.get_text())]

    cut_text: list[tuple[int, int, LTChar | LTAnno | list[Any]]] = []
    try:
        if direction == "horizontal":
            cut_text = _process_horizontal_cut(table, textline, bbox)
        elif direction == "vertical":
            cut_text = _process_vertical_cut(table, textline, bbox)
    except IndexError:
        # Keep the pre-refactor contract: a textline that cannot be mapped
        # onto the table grid yields the sentinel entry instead of crashing.
        return [(-1, -1, textline.get_text())]

    return _group_and_process_chars(cut_text, flag_size, direction, strip_text)
1001+
1002+
1003+ def _process_horizontal_cut (
1004+ table , textline , bbox
1005+ ) -> list [tuple [int , int , LTChar | LTAnno | list [Any ]]]:
1006+ """Process horizontal cuts of the textline."""
1007+ cut_text : list [tuple [int , int , LTChar | LTAnno | list [Any ]]] = []
1008+ x_overlap = [
1009+ i for i , x in enumerate (table .cols ) if x [0 ] <= bbox [2 ] and bbox [0 ] <= x [1 ]
1010+ ]
1011+ r_idx = [
1012+ j for j , r in enumerate (table .rows ) if r [1 ] <= (bbox [1 ] + bbox [3 ]) / 2 <= r [0 ]
1013+ ]
1014+
1015+ if not r_idx :
1016+ return cut_text
1017+
1018+ r = r_idx [0 ]
1019+ x_cuts = [
1020+ (c , table .cells [r ][c ].x2 ) for c in x_overlap if table .cells [r ][c ].right
1021+ ] or [(x_overlap [0 ], table .cells [r ][- 1 ].x2 )]
1022+
1023+ for obj in textline ._objs :
1024+ row = table .rows [r ]
1025+ for cut in x_cuts :
1026+ if (
1027+ isinstance (obj , LTChar )
1028+ and row [1 ] <= (obj .y0 + obj .y1 ) / 2 <= row [0 ]
1029+ and (obj .x0 + obj .x1 ) / 2 <= cut [1 ]
1030+ ):
1031+ cut_text .append ((r , cut [0 ], obj ))
1032+ break
1033+ elif isinstance (obj , LTAnno ):
1034+ cut_text .append ((r , cut [0 ], obj ))
1035+ return cut_text
1036+
1037+
1038+ def _process_vertical_cut (
1039+ table , textline , bbox
1040+ ) -> list [tuple [int , int , LTChar | LTAnno | list [Any ]]]:
1041+ """Process vertical cuts of the textline."""
1042+ cut_text : list [tuple [int , int , LTChar | LTAnno | list [Any ]]] = []
1043+ y_overlap = [
1044+ j for j , y in enumerate (table .rows ) if y [1 ] <= bbox [3 ] and bbox [1 ] <= y [0 ]
1045+ ]
1046+ c_idx = [
1047+ i for i , c in enumerate (table .cols ) if c [0 ] <= (bbox [0 ] + bbox [2 ]) / 2 <= c [1 ]
1048+ ]
1049+
1050+ if not c_idx :
1051+ return cut_text
1052+
1053+ c = c_idx [0 ]
1054+ y_cuts = [
1055+ (r , table .cells [r ][c ].y1 ) for r in y_overlap if table .cells [r ][c ].bottom
1056+ ] or [(y_overlap [0 ], table .cells [- 1 ][c ].y1 )]
1057+
1058+ for obj in textline ._objs :
1059+ col = table .cols [c ]
1060+ for cut in y_cuts :
1061+ if (
1062+ isinstance (obj , LTChar )
1063+ and col [0 ] <= (obj .x0 + obj .x1 ) / 2 <= col [1 ]
1064+ and (obj .y0 + obj .y1 ) / 2 >= cut [1 ]
1065+ ):
1066+ cut_text .append ((cut [0 ], c , obj ))
1067+ break
1068+ elif isinstance (obj , LTAnno ):
1069+ cut_text .append ((cut [0 ], c , obj ))
1070+ return cut_text
1071+
1072+
1073+ def _group_and_process_chars (
1074+ cut_text : list [tuple [int , int , LTChar | LTAnno | list [Any ]]],
1075+ flag_size : bool ,
1076+ direction : str ,
1077+ strip_text : str ,
1078+ ): # -> List[Tuple[int, int, str]]
1079+ """Group characters and process them based on size flag."""
1080+ grouped_chars : list [tuple [int , int , str ]] = [] # LTChar
10561081 for key , chars in groupby (cut_text , itemgetter (0 , 1 )):
10571082 if flag_size :
10581083 grouped_chars .append (
@@ -1065,7 +1090,7 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
10651090 )
10661091 )
10671092 else :
1068- gchars = [t [2 ].get_text () for t in chars ]
1093+ gchars = [t [2 ].get_text () for t in chars ] # .get_text()
10691094 grouped_chars .append (
10701095 (key [0 ], key [1 ], text_strip ("" .join (gchars ), strip_text ))
10711096 )
0 commit comments