bosd · bosd · Apr 19, 2020 · Sep 21, 2024 · Sep 21, 2024 · Sep 21, 2024
diff --git a/camelot/core.py b/camelot/core.py
@@ -4,9 +4,15 @@
 import zipfile
 from operator import itemgetter
 
+import cv2
 import numpy as np
 import pandas as pd
 
+from .backends import ImageConversionBackend
+from .utils import build_file_path_in_temp_dir
+from .utils import compute_accuracy
+from .utils import compute_whitespace
+
 
 # minimum number of vertical textline intersections for a textedge
 # to be considered valid
@@ -149,7 +155,7 @@ def get_relevant(self):
         # get vertical textedges that intersect maximum number of
         # times with horizontal textlines
         relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
-        return self._textedges[relevant_align]
+        return list(filter(lambda te: te.is_valid, self._textedges[relevant_align]))
 
     def get_table_areas(self, textlines, relevant_textedges):
         """Returns a dict of interesting table areas on the PDF page
@@ -169,27 +175,26 @@ def pad(area, average_row_height):
 
         table_areas = {}
         for te in relevant_textedges:
-            if te.is_valid:
-                if not table_areas:
+            if not table_areas:
+                table_areas[(te.x, te.y0, te.x, te.y1)] = None
+            else:
+                found = None
+                for area in table_areas:
+                    # check for overlap
+                    if te.y1 >= area[1] and te.y0 <= area[3]:
+                        found = area
+                        break
+                if found is None:
                     table_areas[(te.x, te.y0, te.x, te.y1)] = None
                 else:
-                    found = None
-                    for area in table_areas:
-                        # check for overlap
-                        if te.y1 >= area[1] and te.y0 <= area[3]:
-                            found = area
-                            break
-                    if found is None:
-                        table_areas[(te.x, te.y0, te.x, te.y1)] = None
-                    else:
-                        table_areas.pop(found)
-                        updated_area = (
-                            found[0],
-                            min(te.y0, found[1]),
-                            max(found[2], te.x),
-                            max(found[3], te.y1),
-                        )
-                        table_areas[updated_area] = None
+                    table_areas.pop(found)
+                    updated_area = (
+                        found[0],
+                        min(te.y0, found[1]),
+                        max(found[2], te.x),
+                        max(found[3], te.y1),
+                    )
+                    table_areas[updated_area] = None
 
         # extend table areas based on textlines that overlap
         # vertically. it's possible that these textlines were
@@ -327,6 +332,8 @@ class Table:
         Accuracy with which text was assigned to the cell.
     whitespace : float
         Percentage of whitespace in the table.
+    filename : str
+        Path of the original PDF
     order : int
         Table number on PDF page.
     page : int
@@ -342,8 +349,15 @@ def __init__(self, cols, rows):
         self.shape = (0, 0)
         self.accuracy = 0
         self.whitespace = 0
+        self.filename = None
         self.order = None
         self.page = None
+        self.flavor = None  # Flavor of the parser that generated the table
+        self.pdf_size = None  # Dimensions of the original PDF page
+        self.debug_info = None  # Field holding debug data
+
+        self._image = None
+        self._image_path = None  # Temporary file to hold an image of the pdf
 
     def __repr__(self):
         return f"<{self.__class__.__name__} shape={self.shape}>"
@@ -377,6 +391,30 @@ def parsing_report(self):
         }
         return report
 
+    def record_metadata(self, parser):
+        """Record origin data from the parser and build df, shape and whitespace."""
+        self.flavor = parser.id  # identifier the parser was constructed with
+        self.filename = parser.filename
+        self.debug_info = parser.debug_info
+        data = self.data  # read once, reused for the DataFrame and whitespace
+        self.df = pd.DataFrame(data)
+        self.shape = self.df.shape
+
+        self.whitespace = compute_whitespace(data)
+        self.pdf_size = (parser.pdf_width, parser.pdf_height)  # (width, height)
+
+    def get_pdf_image(self):
+        """Return the PDF image, converting to PNG on first call and caching it."""
+        if self._image is None:  # nothing cached in memory yet
+            if self._image_path is None:  # no PNG has been rendered yet
+                self._image_path = build_file_path_in_temp_dir(
+                    os.path.basename(self.filename), ".png"
+                )
+                backend = ImageConversionBackend(use_fallback=True)
+                backend.convert(self.filename, self._image_path)
+            self._image = cv2.imread(self._image_path)  # NOTE(review): imread may return None on failure - confirm
+        return self._image
+
     def set_all_edges(self):
         """Sets all table edges to True."""
         for row in self.cells:
@@ -686,8 +724,7 @@ def __getitem__(self, idx):
         return self._tables[idx]
 
     def __iter__(self):
-        for t in self._tables:
-            yield t
+        yield from self._tables
 
     @staticmethod
     def _format_func(table, f):

diff --git a/camelot/handlers.py b/camelot/handlers.py
@@ -12,6 +12,7 @@
 from .parsers import Lattice
 from .parsers import Stream
 from .utils import TemporaryDirectory
+from .utils import build_file_path_in_temp_dir
 from .utils import download_url
 from .utils import get_page_layout
 from .utils import get_rotation
@@ -101,12 +102,22 @@ def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
 
         Parameters
         ----------
-        filepath : str
-            Filepath or URL of the PDF file.
         page : int
             Page number.
-        temp : str
-            Tmp directory.
+        layout_kwargs : dict, optional (default: {})
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
+
+
+        Returns
+        -------
+        layout : object
+
+        dimensions : tuple
+            The dimensions of the pdf page
+
+        filepath : str
+            The path of the single page PDF - either the original, or a
+            normalized version.
 
         """
         infile = PdfReader(filepath, strict=False)
@@ -149,7 +160,7 @@ def parse(
         suppress_stdout=False,
         parallel=False,
         layout_kwargs=None,
-        **kwargs
+        **kwargs,
     ):
         """Extracts tables by calling parser.get_tables on all single
         page PDFs.
@@ -189,7 +200,8 @@ def parse(
                     jobs = []
                     for p in self.pages:
                         j = pool.apply_async(
-                            self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                            self._parse_page,
+                            (p, tempdir, parser, suppress_stdout, layout_kwargs),
                         )
                         jobs.append(j)
 
@@ -198,14 +210,14 @@ def parse(
                         tables.extend(t)
             else:
                 for p in self.pages:
-                    t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                    t = self._parse_page(
+                        p, tempdir, parser, suppress_stdout, layout_kwargs
+                    )
                     tables.extend(t)
 
         return TableList(sorted(tables))
 
-    def _parse_page(
-        self, page, tempdir, parser, suppress_stdout, layout_kwargs
-    ):
+    def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs):
         """Extracts tables by calling parser.get_tables on a single
         page PDF.
 
@@ -224,10 +236,14 @@ def _parse_page(
         -------
         tables : camelot.core.TableList
             List of tables found in PDF.
-        
+
         """
         self._save_page(self.filepath, page, tempdir)
         page_path = os.path.join(tempdir, f"page-{page}.pdf")
+        layout, dimensions = get_page_layout(page_path, **layout_kwargs)
+        parser._generate_layout(
+            page_path, layout, dimensions, page, layout_kwargs=layout_kwargs
+        )
         tables = parser.extract_tables(
             page_path, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
         )

diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
@@ -1,19 +1,51 @@
 import os
 
+from ..core import Table
 from ..utils import get_page_layout
 from ..utils import get_text_objects
 
 
 class BaseParser:
     """Defines a base parser."""
 
-    def _generate_layout(self, filename, layout_kwargs):
+    def __init__(self, parser_id):
+        self.id = parser_id
+
+        # For plotting details of parsing algorithms
+        self.debug_info = {}
+
+    def _generate_layout(self, filename, layout, dimensions, page_idx, layout_kwargs):
         self.filename = filename
         self.layout_kwargs = layout_kwargs
-        self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
+        self.layout = layout
+        self.dimensions = dimensions
+        self.page = page_idx
         self.images = get_text_objects(self.layout, ltype="image")
         self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
         self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
         self.pdf_width, self.pdf_height = self.dimensions
         self.rootname, __ = os.path.splitext(self.filename)
         self.imagename = "".join([self.rootname, ".png"])
+
+    def _initialize_new_table(self, table_idx, cols, rows):
+        """Initialize new table object, ready to be populated.
+
+        Parameters
+        ----------
+        table_idx : int
+            Index of this table within the pdf page analyzed.
+        cols : list
+            List of coordinate boundary tuples (left, right).
+        rows : list
+            List of coordinate boundary tuples (bottom, top).
+
+        Returns
+        -------
+        table : camelot.core.Table
+            New table with ``page`` set and ``order`` set to ``table_idx + 1``.
+
+        """
+        table = Table(cols, rows)
+        table.page = self.page
+        table.order = table_idx + 1  # order counts from table_idx + 1
+        return table
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
@@ -2,6 +2,7 @@
 import locale
 import logging
 import os
+import subprocess
 import sys
 import warnings
 
@@ -14,6 +15,7 @@
 from ..image_processing import find_contours
 from ..image_processing import find_joints
 from ..image_processing import find_lines
+from ..utils import build_file_path_in_temp_dir
 from ..utils import compute_accuracy
 from ..utils import compute_whitespace
 from ..utils import get_table_index
@@ -108,12 +110,13 @@ def __init__(
         backend="ghostscript",
         **kwargs,
     ):
+        super().__init__("lattice")
         self.table_regions = table_regions
         self.table_areas = table_areas
         self.process_background = process_background
         self.line_scale = line_scale
         self.copy_text = copy_text
-        self.shift_text = shift_text
+        self.shift_text = shift_text or ["l", "t"]
         self.split_text = split_text
         self.flag_size = flag_size
         self.strip_text = strip_text
@@ -124,6 +127,8 @@ def __init__(
         self.iterations = iterations
         self.resolution = resolution
         self.backend = Lattice._get_backend(backend)
+        self.image_path = None
+        self.pdf_image = None
 
     @staticmethod
     def _get_backend(backend):
@@ -242,15 +247,20 @@ def scale_areas(areas):
                 scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
             return scaled_areas
 
-        self.image, self.threshold = adaptive_threshold(
-            self.imagename,
+        self.image_path = build_file_path_in_temp_dir(
+            os.path.basename(self.filename), ".png"
+        )
+        self.backend.convert(self.filename, self.image_path)
+
+        self.pdf_image, self.threshold = adaptive_threshold(
+            self.image_path,
             process_background=self.process_background,
             blocksize=self.threshold_blocksize,
             c=self.threshold_constant,
         )
 
-        image_width = self.image.shape[1]
-        image_height = self.image.shape[0]
+        image_width = self.pdf_image.shape[1]
+        image_height = self.pdf_image.shape[0]
         image_width_scaler = image_width / float(self.pdf_width)
         image_height_scaler = image_height / float(self.pdf_height)
         pdf_width_scaler = self.pdf_width / float(image_width)
@@ -336,7 +346,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
         if v_s is None or h_s is None:
             raise ValueError(f"No segments found on {self.rootname}")
 
-        table = Table(cols, rows)
+        table = self._initialize_new_table(table_idx, cols, rows)
         # set table edges to True using ver+hor lines
         table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
         # set table border edges to True
@@ -369,30 +379,22 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
         if self.copy_text is not None:
             table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)
 
-        data = table.data
-        table.df = pd.DataFrame(data)
-        table.shape = table.df.shape
-
-        whitespace = compute_whitespace(data)
-        table.flavor = "lattice"
+        table.record_metadata(self)
         table.accuracy = accuracy
-        table.whitespace = whitespace
-        table.order = table_idx + 1
-        table.page = int(os.path.basename(self.rootname).replace("page-", ""))
 
         # for plotting
         _text = []
         _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
         _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
         table._text = _text
-        table._image = (self.image, self.table_bbox_unscaled)
+        table._image = self.pdf_image  # Reuse the image used for calc
+        table._bbox_unscaled = self.table_bbox_unscaled
         table._segments = (self.vertical_segments, self.horizontal_segments)
         table._textedges = None
 
         return table
 
     def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
-        self._generate_layout(filename, layout_kwargs)
         if not suppress_stdout:
             logger.info(f"Processing {os.path.basename(self.rootname)}")