diff --git a/camelot/utils.py b/camelot/utils.py
index aab149e..8d138cf 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -1,5 +1,7 @@
"""General helper utilities to parse the pdf tables."""
+from __future__ import annotations
+
import atexit
import math
import os
@@ -12,6 +14,9 @@
from itertools import groupby
from operator import itemgetter
from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Tuple
from typing import Union
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_netloc
@@ -26,6 +31,7 @@
from pdfminer.layout import LTAnno
from pdfminer.layout import LTChar
from pdfminer.layout import LTImage
+from pdfminer.layout import LTTextLine
from pdfminer.layout import LTTextLineHorizontal
from pdfminer.layout import LTTextLineVertical
from pdfminer.pdfdocument import PDFDocument
@@ -83,7 +89,7 @@ def random_string(length):
return ret
-def download_url(url: str) -> Union[StrByteType, Path]:
+def download_url(url: str) -> StrByteType | Path:
"""Download file from specified URL.
Parameters
@@ -951,124 +957,183 @@ def flag_font_size(textline, direction, strip_text=""):
return text_strip(fstring, strip_text)
-def split_textline(table, textline, direction, flag_size=False, strip_text=""):
+def split_textline(
+ table: Any,
+ textline: LTTextLine,
+ direction: str,
+ flag_size: bool = False,
+ strip_text: str = "",
+) -> list[tuple[int, int, str]]:
"""Split textline into substrings if it spans across multiple rows/columns.
Parameters
----------
table : camelot.core.Table
- textline : object
+ The table structure containing rows and columns.
+ textline : LTTextLine
PDFMiner LTTextLine object.
- direction : string
- Direction of the PDFMiner LTTextLine object.
- flag_size : bool, optional (default: False)
- Whether or not to highlight a substring using
- if its size is different from rest of the string. (Useful for
- super and subscripts.)
- strip_text : str, optional (default: '')
- Characters that should be stripped from a string before
- assigning it to a cell.
+ direction : str
+ Direction of the PDFMiner LTTextLine object, either "horizontal" or "vertical".
+ flag_size : bool, optional
+ Whether to highlight a substring using if its size differs from the rest of the string.
+ strip_text : str, optional
+ Characters to strip from a string before assigning it to a cell.
Returns
-------
- grouped_chars : list
- List of tuples of the form (idx, text) where idx is the index
- of row/column and text is the an LTTextLine substring.
-
+ List[tuple[int, int, str]]
+ A list of tuples of the form (idx, text) where idx is the index of row/column
+ and text is an LTTextLine substring.
"""
- cut_text = []
+ cut_text: list[tuple[int, int, LTChar | LTAnno | list[Any]]] = []
bbox = textline.bbox
- try:
- if textline.is_empty():
- return [(-1, -1, textline.get_text())]
-
- if direction == "horizontal" and not textline.is_empty():
- x_overlap = [
- i
- for i, x in enumerate(table.cols)
- if x[0] <= bbox[2] and bbox[0] <= x[1]
- ]
- r_idx = [
- j
- for j, r in enumerate(table.rows)
- if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]
- ]
- r = r_idx[0]
- x_cuts = [
- (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
- ]
- if not x_cuts:
- x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
- for obj in textline._objs:
- row = table.rows[r]
- for cut in x_cuts:
- if isinstance(obj, LTChar):
- if (
- row[1] <= (obj.y0 + obj.y1) / 2 <= row[0]
- and (obj.x0 + obj.x1) / 2 <= cut[1]
- ):
- cut_text.append((r, cut[0], obj))
- break
- else:
- # TODO: add test
- if cut == x_cuts[-1]:
- new_idx = min(cut[0] + 1, len(table.cols) - 1)
- cut_text.append((r, new_idx, obj))
- elif isinstance(obj, LTAnno):
- cut_text.append((r, cut[0], obj))
- elif direction == "vertical" and not textline.is_empty():
- y_overlap = [
- j
- for j, y in enumerate(table.rows)
- if y[1] <= bbox[3] and bbox[1] <= y[0]
- ]
- c_idx = [
- i
- for i, c in enumerate(table.cols)
- if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]
- ]
- c = c_idx[0]
- y_cuts = [
- (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
- ]
- if not y_cuts:
- y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
- for obj in textline._objs:
- col = table.cols[c]
- for cut in y_cuts:
- if isinstance(obj, LTChar):
- if (
- col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
- and (obj.y0 + obj.y1) / 2 >= cut[1]
- ):
- cut_text.append((cut[0], c, obj))
- break
- else:
- # TODO: add test
- if cut == y_cuts[-1]:
- new_idx = max(cut[0] - 1, 0)
- cut_text.append((new_idx, c, obj))
- elif isinstance(obj, LTAnno):
- cut_text.append((cut[0], c, obj))
- except IndexError:
+
+ if textline.is_empty():
return [(-1, -1, textline.get_text())]
- grouped_chars = []
+
+ if direction == "horizontal":
+ cut_text = _process_horizontal_cut(table, textline, bbox)
+ elif direction == "vertical":
+ cut_text = _process_vertical_cut(table, textline, bbox)
+
+ grouped_chars = _group_and_process_chars(cut_text, flag_size, direction, strip_text)
+ return grouped_chars
+
+
+def _process_horizontal_cut(
+ table, textline, bbox
+) -> list[tuple[int, int, LTChar | LTAnno | list[Any]]]:
+ """Process horizontal cuts of the textline."""
+ cut_text: list[tuple[int, int, LTChar | LTAnno | list[Any]]] = []
+ x_overlap = [
+ i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]
+ ]
+ r_idx = [
+ j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]
+ ]
+
+ if not r_idx:
+ return cut_text
+
+ r = r_idx[0]
+ x_cuts = [
+ (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
+ ] or [(x_overlap[0], table.cells[r][-1].x2)]
+
+ for obj in textline._objs:
+ row = table.rows[r]
+ for cut in x_cuts:
+ if (
+ isinstance(obj, LTChar)
+ and row[1] <= (obj.y0 + obj.y1) / 2 <= row[0]
+ and (obj.x0 + obj.x1) / 2 <= cut[1]
+ ):
+ cut_text.append((r, cut[0], obj))
+ break
+ elif isinstance(obj, LTAnno):
+ cut_text.append((r, cut[0], obj))
+ return cut_text
+
+
+def _process_vertical_cut(
+ table, textline, bbox
+) -> list[tuple[int, int, LTChar | LTAnno | list[Any]]]:
+ """Process vertical cuts of the textline."""
+ cut_text: list[tuple[int, int, LTChar | LTAnno | list[Any]]] = []
+ y_overlap = [
+ j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]
+ ]
+ c_idx = [
+ i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]
+ ]
+
+ if not c_idx:
+ return cut_text
+
+ c = c_idx[0]
+ y_cuts = [
+ (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
+ ] or [(y_overlap[0], table.cells[-1][c].y1)]
+
+ for obj in textline._objs:
+ col = table.cols[c]
+ for cut in y_cuts:
+ if (
+ isinstance(obj, LTChar)
+ and col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
+ and (obj.y0 + obj.y1) / 2 >= cut[1]
+ ):
+ cut_text.append((cut[0], c, obj))
+ break
+ elif isinstance(obj, LTAnno):
+ cut_text.append((cut[0], c, obj))
+ return cut_text
+
+
+def _group_and_process_chars(
+ cut_text: list[tuple[int, int, LTChar | LTAnno | list[Any]]],
+ flag_size: bool,
+ direction: str,
+ strip_text: str,
+) -> list[tuple[int, int, str]]:
+ """
+ Group characters and process them based on size flag.
+
+ Parameters
+ ----------
+ cut_text : list of tuples
+ Each tuple consists of (x0, y0, character), where x0 and y0 are
+ coordinates and character can be an instance of LTChar, LTAnno,
+ or a list of any type.
+
+ flag_size : bool
+ A flag indicating whether to group by font size.
+
+ direction : str
+ Direction for processing the text (e.g., 'horizontal' or 'vertical').
+
+ strip_text : str
+ Characters to strip from the text.
+
+ Returns
+ -------
+ list of tuples
+ Each tuple consists of (x0, y0, processed_text), where processed_text
+ is the grouped and processed text based on the specified conditions.
+ """
+ grouped_chars: list[tuple[int, int, str]] = []
+
for key, chars in groupby(cut_text, itemgetter(0, 1)):
+ chars_list = list(chars) # Convert the iterator to a list to reuse it
+
if flag_size:
grouped_chars.append(
(
key[0],
key[1],
flag_font_size(
- [t[2] for t in chars], direction, strip_text=strip_text
+ [t[2] for t in chars_list], direction, strip_text=strip_text
),
)
)
else:
- gchars = [t[2].get_text() for t in chars]
+ # Check types before calling get_text
+ gchars = []
+ for t in chars_list:
+ if isinstance(
+ t[2], (LTChar, LTAnno)
+ ): # Ensure it's one of the expected types
+ gchars.append(t[2].get_text()) # Call get_text() safely
+ else:
+ # Handle the case where t[2] is a list or other type
+ gchars.extend(
+ t[2]
+ ) # Assuming it's iterable and we want to extend the list
+
grouped_chars.append(
(key[0], key[1], text_strip("".join(gchars), strip_text))
)
+
return grouped_chars