Skip to content
This repository was archived by the owner on Apr 2, 2025. It is now read-only.
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
251 changes: 158 additions & 93 deletions camelot/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""General helper utilities to parse the pdf tables."""

from __future__ import annotations

import atexit
import math
import os
Expand All @@ -12,6 +14,9 @@
from itertools import groupby
from operator import itemgetter
from pathlib import Path
from typing import Any
from typing import List
from typing import Tuple
from typing import Union
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_netloc
Expand All @@ -26,6 +31,7 @@
from pdfminer.layout import LTAnno
from pdfminer.layout import LTChar
from pdfminer.layout import LTImage
from pdfminer.layout import LTTextLine
from pdfminer.layout import LTTextLineHorizontal
from pdfminer.layout import LTTextLineVertical
from pdfminer.pdfdocument import PDFDocument
Expand Down Expand Up @@ -83,7 +89,7 @@ def random_string(length):
return ret


def download_url(url: str) -> Union[StrByteType, Path]:
def download_url(url: str) -> StrByteType | Path:
"""Download file from specified URL.

Parameters
Expand Down Expand Up @@ -951,124 +957,183 @@ def flag_font_size(textline, direction, strip_text=""):
return text_strip(fstring, strip_text)


def split_textline(
    table: Any,
    textline: LTTextLine,
    direction: str,
    flag_size: bool = False,
    strip_text: str = "",
) -> list[tuple[int, int, str]]:
    """Split textline into substrings if it spans across multiple rows/columns.

    Parameters
    ----------
    table : camelot.core.Table
        The table structure containing rows, columns and cells.
    textline : LTTextLine
        PDFMiner LTTextLine object.
    direction : str
        Direction of the PDFMiner LTTextLine object, either "horizontal"
        or "vertical". Any other value produces an empty list.
    flag_size : bool, optional (default: False)
        Whether to highlight a substring using <s></s> if its size differs
        from the rest of the string. (Useful for super and subscripts.)
    strip_text : str, optional (default: '')
        Characters to strip from a string before assigning it to a cell.

    Returns
    -------
    list[tuple[int, int, str]]
        Tuples of the form (r_idx, c_idx, text) where r_idx and c_idx are
        the row and column index of the target cell and text is the
        substring of the textline assigned to it. A single
        ``(-1, -1, textline.get_text())`` entry is returned when the
        textline is empty or cannot be placed inside the table.
    """
    if textline.is_empty():
        return [(-1, -1, textline.get_text())]

    if direction == "horizontal":
        cutter = _process_horizontal_cut
    elif direction == "vertical":
        cutter = _process_vertical_cut
    else:
        # Unknown direction: nothing is cut, so nothing is grouped.
        return []

    try:
        cut_text = cutter(table, textline, textline.bbox)
    except IndexError:
        # A row/column/cell lookup failed inside the cutter.
        cut_text = []

    if not cut_text:
        # Keep the pre-refactor contract: a non-empty textline that cannot
        # be located in the table is reported at (-1, -1) with its full
        # text rather than raising or being silently dropped.
        return [(-1, -1, textline.get_text())]

    return _group_and_process_chars(cut_text, flag_size, direction, strip_text)


def _process_horizontal_cut(
    table, textline, bbox
) -> list[tuple[int, int, LTChar | LTAnno | list[Any]]]:
    """Assign each object of a horizontal textline to a (row, column) cell.

    Parameters
    ----------
    table : camelot.core.Table
        Table whose ``rows``, ``cols`` and ``cells`` are used for lookup.
    textline : LTTextLine
        PDFMiner textline whose ``_objs`` are distributed over the columns.
    bbox : tuple
        Bounding box of the textline, indexed as (x0, y0, x1, y1).

    Returns
    -------
    list[tuple[int, int, LTChar | LTAnno | list[Any]]]
        Tuples of the form (r_idx, c_idx, obj) in textline order. The list
        is empty when the textline's vertical midpoint falls in no row or
        its bounding box overlaps no column.
    """
    cut_text: list[tuple[int, int, LTChar | LTAnno | list[Any]]] = []
    x_overlap = [
        i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]
    ]
    mid_y = (bbox[1] + bbox[3]) / 2
    r_idx = [j for j, r in enumerate(table.rows) if r[1] <= mid_y <= r[0]]

    # Without a row or an overlapping column there is nothing to cut at;
    # bail out instead of raising IndexError on x_overlap[0] below.
    if not r_idx or not x_overlap:
        return cut_text

    r = r_idx[0]
    row = table.rows[r]
    # Cut at the right edge of every overlapped cell that has a right
    # border; if none has one, use a single cut at the row's last cell.
    x_cuts = [
        (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
    ] or [(x_overlap[0], table.cells[r][-1].x2)]
    last_col = len(table.cols) - 1

    for obj in textline._objs:
        if isinstance(obj, LTAnno):
            # Annotations carry no position; they are attached to every cut.
            cut_text.extend((r, c, obj) for c, _ in x_cuts)
            continue
        if not isinstance(obj, LTChar):
            continue

        in_row = row[1] <= (obj.y0 + obj.y1) / 2 <= row[0]
        char_mid_x = (obj.x0 + obj.x1) / 2
        for c, x_right in x_cuts:
            if in_row and char_mid_x <= x_right:
                cut_text.append((r, c, obj))
                break
        else:
            # No cut accepted the character: place it in the column after
            # the last cut (clamped to the last column) so it is not lost.
            # TODO: add test
            cut_text.append((r, min(x_cuts[-1][0] + 1, last_col), obj))
    return cut_text


def _process_vertical_cut(
    table, textline, bbox
) -> list[tuple[int, int, LTChar | LTAnno | list[Any]]]:
    """Assign each object of a vertical textline to a (row, column) cell.

    Parameters
    ----------
    table : camelot.core.Table
        Table whose ``rows``, ``cols`` and ``cells`` are used for lookup.
    textline : LTTextLine
        PDFMiner textline whose ``_objs`` are distributed over the rows.
    bbox : tuple
        Bounding box of the textline, indexed as (x0, y0, x1, y1).

    Returns
    -------
    list[tuple[int, int, LTChar | LTAnno | list[Any]]]
        Tuples of the form (r_idx, c_idx, obj) in textline order. The list
        is empty when the textline's horizontal midpoint falls in no column
        or its bounding box overlaps no row.
    """
    cut_text: list[tuple[int, int, LTChar | LTAnno | list[Any]]] = []
    y_overlap = [
        j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]
    ]
    mid_x = (bbox[0] + bbox[2]) / 2
    c_idx = [i for i, c in enumerate(table.cols) if c[0] <= mid_x <= c[1]]

    # Without a column or an overlapping row there is nothing to cut at;
    # bail out instead of raising IndexError on y_overlap[0] below.
    if not c_idx or not y_overlap:
        return cut_text

    c = c_idx[0]
    col = table.cols[c]
    # Cut at the bottom edge of every overlapped cell that has a bottom
    # border; if none has one, use a single cut at the column's last cell.
    y_cuts = [
        (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
    ] or [(y_overlap[0], table.cells[-1][c].y1)]

    for obj in textline._objs:
        if isinstance(obj, LTAnno):
            # Annotations carry no position; they are attached to every cut.
            cut_text.extend((r, c, obj) for r, _ in y_cuts)
            continue
        if not isinstance(obj, LTChar):
            continue

        in_col = col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
        char_mid_y = (obj.y0 + obj.y1) / 2
        for r, y_bottom in y_cuts:
            if in_col and char_mid_y >= y_bottom:
                cut_text.append((r, c, obj))
                break
        else:
            # No cut accepted the character: fall back to the row index
            # just before the last cut (clamped at 0), as the pre-refactor
            # implementation did, so the character is not lost.
            # TODO: add test
            cut_text.append((max(y_cuts[-1][0] - 1, 0), c, obj))
    return cut_text


def _group_and_process_chars(
    cut_text: list[tuple[int, int, LTChar | LTAnno | list[Any]]],
    flag_size: bool,
    direction: str,
    strip_text: str,
) -> list[tuple[int, int, str]]:
    """Merge consecutive cut objects that share a cell into one string.

    Parameters
    ----------
    cut_text : list of tuples
        Tuples of the form (r_idx, c_idx, obj) where r_idx and c_idx are
        row and column indices and obj is an LTChar, an LTAnno, or an
        iterable of strings.
    flag_size : bool
        If True, the text of each group is built by ``flag_font_size``;
        otherwise the objects' texts are concatenated and passed through
        ``text_strip``.
    direction : str
        Direction of the textline ("horizontal" or "vertical"); forwarded
        to ``flag_font_size``.
    strip_text : str
        Characters to strip from the text.

    Returns
    -------
    list of tuples
        Tuples of the form (r_idx, c_idx, text), one per run of consecutive
        entries in ``cut_text`` with the same (r_idx, c_idx). The input is
        not sorted first, so a cell that occurs in several separate runs
        produces several tuples.
    """
    grouped_chars: list[tuple[int, int, str]] = []

    for (r_idx, c_idx), run in groupby(cut_text, itemgetter(0, 1)):
        # Materialise the run: groupby hands out a one-shot iterator.
        objs = [entry[2] for entry in run]

        if flag_size:
            text = flag_font_size(objs, direction, strip_text=strip_text)
        else:
            pieces: list[str] = []
            for obj in objs:
                if isinstance(obj, (LTChar, LTAnno)):
                    pieces.append(obj.get_text())
                else:
                    # Anything else is treated as an iterable of strings.
                    pieces.extend(obj)
            text = text_strip("".join(pieces), strip_text)

        grouped_chars.append((r_idx, c_idx, text))

    return grouped_chars


Expand Down
Loading