def get_table_index(
    table, t, direction, split_text=False, flag_size=False, strip_text=""
):
    """
    Get indices of the table cell.

    Get the index of a table cell where a given text object lies by
    comparing their y and x-coordinates.

    Parameters
    ----------
    table : camelot.core.Table
        The table structure containing rows and columns.
    t : object
        PDFMiner LTTextLine object.
    direction : string
        Direction of the PDFMiner LTTextLine object.
    split_text : bool, optional (default: False)
        Whether or not to split a text line if it spans across multiple
        cells (delegated to ``split_textline``).
    flag_size : bool, optional (default: False)
        Whether or not to flag a substring whose font size differs from
        the rest of the string (delegated to ``flag_font_size``).
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before assigning
        it to a cell.

    Returns
    -------
    indices : list
        List of tuples of the form (r_idx, c_idx, text) where r_idx
        and c_idx are row and column indices. Empty when the vertical
        midpoint of the text does not fall inside any row.
    error : float
        Assignment error, percentage of text area that lies outside a cell.
        ``0.0`` when no row matched.
    """
    # Function-scope import: the module's import block is not visible from
    # this chunk, and the refactored code no longer referenced ``warnings``.
    import warnings

    # find_row_index returns a single int (not a pair), so do not unpack it.
    r_idx = find_row_index(table, t)
    if r_idx == -1:
        return [], 0.0  # No valid row found

    lt_col_overlap = calculate_column_overlap(table, t, r_idx)
    if all(overlap == -1 for overlap in lt_col_overlap):
        # The text does not touch any column horizontally. Warn, as the
        # pre-refactor code did; the first column is still selected below
        # because every entry equals -1.
        text = t.get_text().strip("\n")
        text_range = (t.x0, t.x1)
        col_range = (table.cols[0][0], table.cols[-1][1])
        warnings.warn(
            f"{text} {text_range} does not lie in column range {col_range}"
        )
    c_idx = lt_col_overlap.index(max(lt_col_overlap))

    error = calculate_assignment_error(t, table, r_idx, c_idx)

    if split_text:
        return (
            split_textline(
                table, t, direction, flag_size=flag_size, strip_text=strip_text
            ),
            error,
        )

    if flag_size:
        return (
            [
                (
                    r_idx,
                    c_idx,
                    flag_font_size(t._objs, direction, strip_text=strip_text),
                )
            ],
            error,
        )

    # Pass the raw text (newlines included), exactly as the pre-refactor
    # code did; only ``strip_text`` decides what gets removed.
    return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error


def find_row_index(table, t):
    """
    Find the row index for the given text object.

    A row matches when the vertical midpoint of the text lies strictly
    between the row's two y-coordinates (``row[1] < mid_y < row[0]``).

    Parameters
    ----------
    table : camelot.core.Table
        The table structure containing rows and columns.
    t : object
        PDFMiner LTTextLine object.

    Returns
    -------
    int
        The index of the first matching row, or -1 if no row matches.
    """
    mid_y = (t.y0 + t.y1) / 2.0
    for r_idx, row in enumerate(table.rows):
        if row[1] < mid_y < row[0]:
            return r_idx
    return -1  # No valid row found


def calculate_column_overlap(table, t, r_idx):
    """
    Calculate the horizontal overlap of a text object with every column.

    Parameters
    ----------
    table : camelot.core.Table
        The table structure containing rows and columns.
    t : object
        PDFMiner LTTextLine object.
    r_idx : int
        Row index where the text object is located. Currently unused;
        kept so the signature stays stable for existing callers.

    Returns
    -------
    list
        One entry per column: the overlapping width divided by the column
        width, or -1 when the text does not touch the column at all.
    """
    lt_col_overlap = []
    for c in table.cols:
        if c[0] <= t.x1 and c[1] >= t.x0:
            left = max(t.x0, c[0])
            right = min(t.x1, c[1])
            lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
        else:
            lt_col_overlap.append(-1)
    return lt_col_overlap


def calculate_assignment_error(t, table, r_idx, c_idx):
    """
    Calculate the assignment error for the given text object.

    Parameters
    ----------
    t : object
        PDFMiner LTTextLine object.
    table : camelot.core.Table
        The table structure containing rows and columns.
    r_idx : int
        Row index where the text object is located.
    c_idx : int
        Column index where the text object is located.

    Returns
    -------
    float
        The calculated assignment error: the offsets by which the text
        sticks out of the cell, weighted by the text's own width/height
        and divided by the text area.
    """
    row = table.rows[r_idx]
    col = table.cols[c_idx]

    # Each offset is non-zero only under the same condition the original
    # inline code used (t.y0 > row[0], t.y1 < row[1], t.x0 < col[0],
    # t.x1 > col[1]).
    y0_offset = max(0, t.y0 - row[0])
    y1_offset = max(0, row[1] - t.y1)
    x0_offset = max(0, col[0] - t.x0)  # text starts left of the column
    x1_offset = max(0, t.x1 - col[1])

    # Only a zero-sized dimension is replaced by 1.0 (avoids dividing by
    # zero); genuine sub-unit sizes must be kept as they are.
    x = abs(t.x0 - t.x1) or 1.0
    y = abs(t.y0 - t.y1) or 1.0
    charea = x * y
    return ((x * (y0_offset + y1_offset)) + (y * (x0_offset + x1_offset))) / charea