Skip to content
This repository was archived by the owner on Apr 11, 2025. It is now read-only.

Commit 2336e2a

Browse files
committed
[REF]: get_table_index
1 parent ff9a501 commit 2336e2a

File tree

1 file changed

+59
-42
lines changed

1 file changed

+59
-42
lines changed

camelot/utils.py

Lines changed: 59 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -958,7 +958,6 @@ def flag_font_size(
958958
str
959959
The processed string with flagged super/subscripts.
960960
"""
961-
962961
# Determine size based on direction and collect text and size
963962
d: list[tuple[str, float]] = []
964963
if direction == "horizontal":
@@ -1174,46 +1173,43 @@ def _group_and_process_chars(
11741173
def get_table_index(
11751174
table, t, direction, split_text=False, flag_size=False, strip_text=""
11761175
):
1177-
"""Get indices of the table cell.
1176+
"""
1177+
Get indices of the table cell.
11781178
1179-
Get the index of a table cell where given text object lies by
1179+
Get the index of a table cell where a given text object lies by
11801180
comparing their y and x-coordinates.
11811181
11821182
Parameters
11831183
----------
11841184
table : camelot.core.Table
1185+
The table structure containing rows and columns.
11851186
t : object
11861187
PDFMiner LTTextLine object.
11871188
direction : string
11881189
Direction of the PDFMiner LTTextLine object.
11891190
split_text : bool, optional (default: False)
1190-
Whether or not to split a text line if it spans across
1191-
multiple cells.
1191+
Whether or not to split a text line if it spans across multiple cells.
11921192
flag_size : bool, optional (default: False)
1193-
Whether or not to highlight a substring using <s></s>
1194-
if its size is different from rest of the string. (Useful for
1195-
super and subscripts)
1193+
Whether to highlight a substring using <s></s> if its size is different
1194+
from the rest of the string.
11961195
strip_text : str, optional (default: '')
1197-
Characters that should be stripped from a string before
1198-
assigning it to a cell.
1196+
Characters that should be stripped from a string before assigning it to a cell.
11991197
12001198
Returns
12011199
-------
1202-
indices : list
1203-
List of tuples of the form (r_idx, c_idx, text) where r_idx
1204-
and c_idx are row and column indices.
1205-
error : float
1206-
Assignment error, percentage of text area that lies outside
1207-
a cell.
1200+
list
1201+
List of tuples of the form (r_idx, c_idx, text) where r_idx and c_idx
1202+
are row and column indices, respectively.
1203+
float
1204+
Assignment error, percentage of text area that lies outside a cell.
12081205
+-------+
12091206
| |
12101207
| [Text bounding box]
12111208
| |
12121209
+-------+
1213-
12141210
"""
12151211
r_idx, c_idx = [-1] * 2
1216-
for r in range(len(table.rows)):
1212+
for r in range(len(table.rows)): # noqa
12171213
if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
12181214
r
12191215
][1]:
@@ -1230,13 +1226,53 @@ def get_table_index(
12301226
text_range = (t.x0, t.x1)
12311227
col_range = (table.cols[0][0], table.cols[-1][1])
12321228
warnings.warn(
1233-
f"{text} {text_range} does not lie in column range {col_range}"
1229+
f"{text} {text_range} does not lie in column range {col_range}",
1230+
stacklevel=1,
12341231
)
12351232
r_idx = r
12361233
c_idx = lt_col_overlap.index(max(lt_col_overlap))
12371234
break
1235+
if r_idx == -1:
1236+
return [], 1.0 # Return early if no valid row is found
1237+
1238+
error = calculate_assignment_error(t, table, r_idx, c_idx)
12381239

1239-
# error calculation
1240+
if split_text:
1241+
return (
1242+
split_textline(
1243+
table, t, direction, flag_size=flag_size, strip_text=strip_text
1244+
),
1245+
error,
1246+
)
1247+
text = t.get_text().strip("\n")
1248+
if flag_size:
1249+
return [
1250+
(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))
1251+
], error
1252+
else:
1253+
return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
1254+
1255+
1256+
def calculate_assignment_error(t, table, r_idx, c_idx):
1257+
"""
1258+
Calculate the assignment error for the given text object.
1259+
1260+
Parameters
1261+
----------
1262+
t : object
1263+
PDFMiner LTTextLine object.
1264+
table : camelot.core.Table
1265+
The table structure containing rows and columns.
1266+
r_idx : int
1267+
Row index where the text object is located.
1268+
c_idx : int
1269+
Column index where the text object is located.
1270+
1271+
Returns
1272+
-------
1273+
float
1274+
The calculated assignment error.
1275+
"""
12401276
y0_offset, y1_offset, x0_offset, x1_offset = [0] * 4
12411277
if t.y0 > table.rows[r_idx][0]:
12421278
y0_offset = abs(t.y0 - table.rows[r_idx][0])
@@ -1246,32 +1282,13 @@ def get_table_index(
12461282
x0_offset = abs(t.x0 - table.cols[c_idx][0])
12471283
if t.x1 > table.cols[c_idx][1]:
12481284
x1_offset = abs(t.x1 - table.cols[c_idx][1])
1285+
12491286
x = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
12501287
y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
1288+
12511289
charea = x * y
12521290
error = ((x * (y0_offset + y1_offset)) + (y * (x0_offset + x1_offset))) / charea
1253-
1254-
if split_text:
1255-
return (
1256-
split_textline(
1257-
table, t, direction, flag_size=flag_size, strip_text=strip_text
1258-
),
1259-
error,
1260-
)
1261-
else:
1262-
if flag_size:
1263-
return (
1264-
[
1265-
(
1266-
r_idx,
1267-
c_idx,
1268-
flag_font_size(t._objs, direction, strip_text=strip_text),
1269-
)
1270-
],
1271-
error,
1272-
)
1273-
else:
1274-
return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
1291+
return error
12751292

12761293

12771294
def compute_accuracy(error_weights):

0 commit comments

Comments
 (0)