@@ -958,7 +958,6 @@ def flag_font_size(
958958 str
959959 The processed string with flagged super/subscripts.
960960 """
961-
962961 # Determine size based on direction and collect text and size
963962 d : list [tuple [str , float ]] = []
964963 if direction == "horizontal" :
@@ -1174,46 +1173,43 @@ def _group_and_process_chars(
11741173def get_table_index (
11751174 table , t , direction , split_text = False , flag_size = False , strip_text = ""
11761175):
1177- """Get indices of the table cell.
1176+ """
1177+ Get indices of the table cell.
11781178
1179- Get the index of a table cell where given text object lies by
1179+ Get the index of a table cell where a given text object lies by
11801180 comparing their y and x-coordinates.
11811181
11821182 Parameters
11831183 ----------
11841184 table : camelot.core.Table
1185+ The table structure containing rows and columns.
11851186 t : object
11861187 PDFMiner LTTextLine object.
11871188 direction : string
11881189 Direction of the PDFMiner LTTextLine object.
11891190 split_text : bool, optional (default: False)
1190- Whether or not to split a text line if it spans across
1191- multiple cells.
1191+ Whether or not to split a text line if it spans across multiple cells.
11921192 flag_size : bool, optional (default: False)
1193- Whether or not to highlight a substring using <s></s>
1194- if its size is different from rest of the string. (Useful for
1195- super and subscripts)
1193+ Whether to highlight a substring using <s></s> if its size is different
1194+ from the rest of the string.
11961195 strip_text : str, optional (default: '')
1197- Characters that should be stripped from a string before
1198- assigning it to a cell.
1196+ Characters that should be stripped from a string before assigning it to a cell.
11991197
12001198 Returns
12011199 -------
1202- indices : list
1203- List of tuples of the form (r_idx, c_idx, text) where r_idx
1204- and c_idx are row and column indices.
1205- error : float
1206- Assignment error, percentage of text area that lies outside
1207- a cell.
1200+ list
1201+ List of tuples of the form (r_idx, c_idx, text) where r_idx and c_idx
1202+ are row and column indices, respectively.
1203+ float
1204+ Assignment error, percentage of text area that lies outside a cell.
12081205 +-------+
12091206 | |
12101207 | [Text bounding box]
12111208 | |
12121209 +-------+
1213-
12141210 """
12151211 r_idx , c_idx = [- 1 ] * 2
1216- for r in range (len (table .rows )):
1212+ for r in range (len (table .rows )): # noqa
12171213 if (t .y0 + t .y1 ) / 2.0 < table .rows [r ][0 ] and (t .y0 + t .y1 ) / 2.0 > table .rows [
12181214 r
12191215 ][1 ]:
@@ -1230,13 +1226,53 @@ def get_table_index(
12301226 text_range = (t .x0 , t .x1 )
12311227 col_range = (table .cols [0 ][0 ], table .cols [- 1 ][1 ])
12321228 warnings .warn (
1233- f"{ text } { text_range } does not lie in column range { col_range } "
1229+ f"{ text } { text_range } does not lie in column range { col_range } " ,
1230+ stacklevel = 1 ,
12341231 )
12351232 r_idx = r
12361233 c_idx = lt_col_overlap .index (max (lt_col_overlap ))
12371234 break
1235+ if r_idx == - 1 :
1236+ return [], 1.0 # Return early if no valid row is found
1237+
1238+ error = calculate_assignment_error (t , table , r_idx , c_idx )
12381239
1239- # error calculation
1240+ if split_text :
1241+ return (
1242+ split_textline (
1243+ table , t , direction , flag_size = flag_size , strip_text = strip_text
1244+ ),
1245+ error ,
1246+ )
1247+ text = t .get_text ().strip ("\n " )
1248+ if flag_size :
1249+ return [
1250+ (r_idx , c_idx , flag_font_size (t ._objs , direction , strip_text = strip_text ))
1251+ ], error
1252+ else :
1253+ return [(r_idx , c_idx , text_strip (t .get_text (), strip_text ))], error
1254+
1255+
1256+ def calculate_assignment_error (t , table , r_idx , c_idx ):
1257+ """
1258+ Calculate the assignment error for the given text object.
1259+
1260+ Parameters
1261+ ----------
1262+ t : object
1263+ PDFMiner LTTextLine object.
1264+ table : camelot.core.Table
1265+ The table structure containing rows and columns.
1266+ r_idx : int
1267+ Row index where the text object is located.
1268+ c_idx : int
1269+ Column index where the text object is located.
1270+
1271+ Returns
1272+ -------
1273+ float
1274+ The calculated assignment error.
1275+ """
12401276 y0_offset , y1_offset , x0_offset , x1_offset = [0 ] * 4
12411277 if t .y0 > table .rows [r_idx ][0 ]:
12421278 y0_offset = abs (t .y0 - table .rows [r_idx ][0 ])
@@ -1246,32 +1282,13 @@ def get_table_index(
12461282 x0_offset = abs (t .x0 - table .cols [c_idx ][0 ])
12471283 if t .x1 > table .cols [c_idx ][1 ]:
12481284 x1_offset = abs (t .x1 - table .cols [c_idx ][1 ])
1285+
12491286 x = 1.0 if abs (t .x0 - t .x1 ) == 0.0 else abs (t .x0 - t .x1 )
12501287 y = 1.0 if abs (t .y0 - t .y1 ) == 0.0 else abs (t .y0 - t .y1 )
1288+
12511289 charea = x * y
12521290 error = ((x * (y0_offset + y1_offset )) + (y * (x0_offset + x1_offset ))) / charea
1253-
1254- if split_text :
1255- return (
1256- split_textline (
1257- table , t , direction , flag_size = flag_size , strip_text = strip_text
1258- ),
1259- error ,
1260- )
1261- else :
1262- if flag_size :
1263- return (
1264- [
1265- (
1266- r_idx ,
1267- c_idx ,
1268- flag_font_size (t ._objs , direction , strip_text = strip_text ),
1269- )
1270- ],
1271- error ,
1272- )
1273- else :
1274- return [(r_idx , c_idx , text_strip (t .get_text (), strip_text ))], error
1291+ return error
12751292
12761293
12771294def compute_accuracy (error_weights ):
0 commit comments