Skip to content
This repository was archived by the owner on Apr 2, 2025. It is now read-only.
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 59 additions & 22 deletions camelot/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,15 @@
import zipfile
from operator import itemgetter

import cv2
import numpy as np
import pandas as pd

from .backends import ImageConversionBackend
from .utils import build_file_path_in_temp_dir
from .utils import compute_accuracy
from .utils import compute_whitespace


# minimum number of vertical textline intersections for a textedge
# to be considered valid
Expand Down Expand Up @@ -149,7 +155,7 @@ def get_relevant(self):
# get vertical textedges that intersect maximum number of
# times with horizontal textlines
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
return self._textedges[relevant_align]
return list(filter(lambda te: te.is_valid, self._textedges[relevant_align]))

def get_table_areas(self, textlines, relevant_textedges):
"""Returns a dict of interesting table areas on the PDF page
Expand All @@ -169,27 +175,26 @@ def pad(area, average_row_height):

table_areas = {}
for te in relevant_textedges:
if te.is_valid:
if not table_areas:
if not table_areas:
table_areas[(te.x, te.y0, te.x, te.y1)] = None
else:
found = None
for area in table_areas:
# check for overlap
if te.y1 >= area[1] and te.y0 <= area[3]:
found = area
break
if found is None:
table_areas[(te.x, te.y0, te.x, te.y1)] = None
else:
found = None
for area in table_areas:
# check for overlap
if te.y1 >= area[1] and te.y0 <= area[3]:
found = area
break
if found is None:
table_areas[(te.x, te.y0, te.x, te.y1)] = None
else:
table_areas.pop(found)
updated_area = (
found[0],
min(te.y0, found[1]),
max(found[2], te.x),
max(found[3], te.y1),
)
table_areas[updated_area] = None
table_areas.pop(found)
updated_area = (
found[0],
min(te.y0, found[1]),
max(found[2], te.x),
max(found[3], te.y1),
)
table_areas[updated_area] = None

# extend table areas based on textlines that overlap
# vertically. it's possible that these textlines were
Expand Down Expand Up @@ -327,6 +332,8 @@ class Table:
Accuracy with which text was assigned to the cell.
whitespace : float
Percentage of whitespace in the table.
filename : str
Path of the original PDF
order : int
Table number on PDF page.
page : int
Expand All @@ -342,8 +349,15 @@ def __init__(self, cols, rows):
self.shape = (0, 0)
self.accuracy = 0
self.whitespace = 0
self.filename = None
self.order = None
self.page = None
self.flavor = None # Flavor of the parser that generated the table
self.pdf_size = None # Dimensions of the original PDF page
self.debug_info = None # Field holding debug data

self._image = None
self._image_path = None # Temporary file to hold an image of the pdf

def __repr__(self):
return f"<{self.__class__.__name__} shape={self.shape}>"
Expand Down Expand Up @@ -377,6 +391,30 @@ def parsing_report(self):
}
return report

def record_metadata(self, parser):
    """Copy provenance details from ``parser`` and refresh derived fields.

    Stores the parser's ``id``, ``filename`` and ``debug_info`` on the
    table, rebuilds ``df`` and ``shape`` from ``self.data``, recomputes the
    whitespace figure, and records the page size as ``(width, height)``.

    Parameters
    ----------
    parser : object
        Parser that produced this table; must expose ``id``, ``filename``,
        ``debug_info``, ``pdf_width`` and ``pdf_height``.

    """
    # Where the table came from.
    self.flavor = parser.id
    self.filename = parser.filename
    self.debug_info = parser.debug_info

    # Read the cell contents once and derive the frame and stats from it.
    cell_data = self.data
    self.df = pd.DataFrame(cell_data)
    self.shape = self.df.shape
    self.whitespace = compute_whitespace(cell_data)

    page_size = (parser.pdf_width, parser.pdf_height)
    self.pdf_size = page_size

def get_pdf_image(self):
"""Return an image of the source PDF, computing it on first use.

The value read by ``cv2.imread`` is stored in ``self._image``; once that
attribute is set, later calls return it without further work.
"""
# Nothing below runs once an image has been cached.
if self._image is None:
if self._image_path is None:
# The PNG is named after the PDF's basename; its location is decided
# by build_file_path_in_temp_dir.
self._image_path = build_file_path_in_temp_dir(
os.path.basename(self.filename), ".png"
)
# NOTE(review): indentation is lost in this view, so it is unclear
# whether the conversion runs only when the path was just created or
# on every call with no cached image -- confirm against the real file.
backend = ImageConversionBackend(use_fallback=True)
backend.convert(self.filename, self._image_path)
# NOTE(review): cv2.imread reportedly returns None instead of raising
# when the file cannot be read; if so, the result is not cached and
# callers receive None -- confirm callers handle that.
self._image = cv2.imread(self._image_path)
return self._image

def set_all_edges(self):
"""Sets all table edges to True."""
for row in self.cells:
Expand Down Expand Up @@ -686,8 +724,7 @@ def __getitem__(self, idx):
return self._tables[idx]

def __iter__(self):
for t in self._tables:
yield t
yield from self._tables

@staticmethod
def _format_func(table, f):
Expand Down
38 changes: 27 additions & 11 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from .parsers import Lattice
from .parsers import Stream
from .utils import TemporaryDirectory
from .utils import build_file_path_in_temp_dir
from .utils import download_url
from .utils import get_page_layout
from .utils import get_rotation
Expand Down Expand Up @@ -101,12 +102,22 @@ def _save_page(self, filepath: Union[StrByteType, Path], page, temp):

Parameters
----------
filepath : str
Filepath or URL of the PDF file.
page : int
Page number.
temp : str
Tmp directory.
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs. # noqa


Returns
-------
layout : object

dimensions : tuple
The dimensions of the pdf page

filepath : str
The path of the single page PDF - either the original, or a
normalized version.

"""
infile = PdfReader(filepath, strict=False)
Expand Down Expand Up @@ -149,7 +160,7 @@ def parse(
suppress_stdout=False,
parallel=False,
layout_kwargs=None,
**kwargs
**kwargs,
):
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
Expand Down Expand Up @@ -189,7 +200,8 @@ def parse(
jobs = []
for p in self.pages:
j = pool.apply_async(
self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
self._parse_page,
(p, tempdir, parser, suppress_stdout, layout_kwargs),
)
jobs.append(j)

Expand All @@ -198,14 +210,14 @@ def parse(
tables.extend(t)
else:
for p in self.pages:
t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
t = self._parse_page(
p, tempdir, parser, suppress_stdout, layout_kwargs
)
tables.extend(t)

return TableList(sorted(tables))

def _parse_page(
self, page, tempdir, parser, suppress_stdout, layout_kwargs
):
def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs):
"""Extracts tables by calling parser.get_tables on a single
page PDF.

Expand All @@ -224,10 +236,14 @@ def _parse_page(
-------
tables : camelot.core.TableList
List of tables found in PDF.

"""
self._save_page(self.filepath, page, tempdir)
page_path = os.path.join(tempdir, f"page-{page}.pdf")
layout, dimensions = get_page_layout(page_path, **layout_kwargs)
parser._generate_layout(
page_path, layout, dimensions, page, layout_kwargs=layout_kwargs
)
tables = parser.extract_tables(
page_path, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
)
Expand Down
36 changes: 34 additions & 2 deletions camelot/parsers/base.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,51 @@
import os

from ..core import Table
from ..utils import get_page_layout
from ..utils import get_text_objects


class BaseParser:
    """Defines a base parser."""

    def __init__(self, parser_id):
        """Create a parser identified by ``parser_id``.

        Parameters
        ----------
        parser_id : str
            Identifier of the parser flavor (for example ``"lattice"``).

        """
        self.id = parser_id

        # For plotting details of parsing algorithms
        self.debug_info = {}

    def _generate_layout(self, filename, layout, dimensions, page_idx, layout_kwargs):
        """Store an already-computed page layout and derive text objects.

        Parameters
        ----------
        filename : str
            Path of the PDF file the layout was computed from.
        layout : object
            Layout object for the page, computed by the caller.
        dimensions : tuple
            Page dimensions as ``(width, height)``.
        page_idx : int
            Page number, recorded as ``self.page``.
        layout_kwargs : dict
            Layout keyword arguments, kept on the parser for later use.

        """
        self.filename = filename
        self.layout_kwargs = layout_kwargs
        # The caller supplies the layout; computing it again here would be
        # wasted work because the result would be overwritten immediately.
        self.layout = layout
        self.dimensions = dimensions
        self.page = page_idx
        self.images = get_text_objects(self.layout, ltype="image")
        self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
        self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
        self.pdf_width, self.pdf_height = self.dimensions
        self.rootname, __ = os.path.splitext(self.filename)
        self.imagename = "".join([self.rootname, ".png"])

    def _initialize_new_table(self, table_idx, cols, rows):
        """Initialize new table object, ready to be populated

        Parameters
        ----------
        table_idx : int
            Index of this table within the pdf page analyzed
        cols : list
            list of coordinate boundaries tuples (left, right)
        rows : list
            list of coordinate boundaries tuples (bottom, top)

        Returns
        -------
        table : camelot.core.Table

        """
        table = Table(cols, rows)
        table.page = self.page
        # Table order is 1-based, while table_idx counts from 0.
        table.order = table_idx + 1
        return table
36 changes: 19 additions & 17 deletions camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import locale
import logging
import os
import subprocess
import sys
import warnings

Expand All @@ -14,6 +15,7 @@
from ..image_processing import find_contours
from ..image_processing import find_joints
from ..image_processing import find_lines
from ..utils import build_file_path_in_temp_dir
from ..utils import compute_accuracy
from ..utils import compute_whitespace
from ..utils import get_table_index
Expand Down Expand Up @@ -108,12 +110,13 @@ def __init__(
backend="ghostscript",
**kwargs,
):
super().__init__("lattice")
self.table_regions = table_regions
self.table_areas = table_areas
self.process_background = process_background
self.line_scale = line_scale
self.copy_text = copy_text
self.shift_text = shift_text
self.shift_text = shift_text or ["l", "t"]
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
Expand All @@ -124,6 +127,8 @@ def __init__(
self.iterations = iterations
self.resolution = resolution
self.backend = Lattice._get_backend(backend)
self.image_path = None
self.pdf_image = None

@staticmethod
def _get_backend(backend):
Expand Down Expand Up @@ -242,15 +247,20 @@ def scale_areas(areas):
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
return scaled_areas

self.image, self.threshold = adaptive_threshold(
self.imagename,
self.image_path = build_file_path_in_temp_dir(
os.path.basename(self.filename), ".png"
)
self.backend.convert(self.filename, self.image_path)

self.pdf_image, self.threshold = adaptive_threshold(
self.image_path,
process_background=self.process_background,
blocksize=self.threshold_blocksize,
c=self.threshold_constant,
)

image_width = self.image.shape[1]
image_height = self.image.shape[0]
image_width = self.pdf_image.shape[1]
image_height = self.pdf_image.shape[0]
image_width_scaler = image_width / float(self.pdf_width)
image_height_scaler = image_height / float(self.pdf_height)
pdf_width_scaler = self.pdf_width / float(image_width)
Expand Down Expand Up @@ -336,7 +346,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
if v_s is None or h_s is None:
raise ValueError(f"No segments found on {self.rootname}")

table = Table(cols, rows)
table = self._initialize_new_table(table_idx, cols, rows)
# set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
# set table border edges to True
Expand Down Expand Up @@ -369,30 +379,22 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
if self.copy_text is not None:
table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)

data = table.data
table.df = pd.DataFrame(data)
table.shape = table.df.shape

whitespace = compute_whitespace(data)
table.flavor = "lattice"
table.record_metadata(self)
table.accuracy = accuracy
table.whitespace = whitespace
table.order = table_idx + 1
table.page = int(os.path.basename(self.rootname).replace("page-", ""))

# for plotting
_text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text
table._image = (self.image, self.table_bbox_unscaled)
table._image = self.pdf_image # Reuse the image used for calc
table._bbox_unscaled = self.table_bbox_unscaled
table._segments = (self.vertical_segments, self.horizontal_segments)
table._textedges = None

return table

def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
self._generate_layout(filename, layout_kwargs)
if not suppress_stdout:
logger.info(f"Processing {os.path.basename(self.rootname)}")

Expand Down
Loading