diff --git a/camelot/backends/ghostscript_backend.py b/camelot/backends/ghostscript_backend.py
index 9379ec5..ed7093b 100644
--- a/camelot/backends/ghostscript_backend.py
+++ b/camelot/backends/ghostscript_backend.py
@@ -17,12 +17,13 @@ def installed_windows():
 class GhostscriptBackend:
     def installed(self):
-        if sys.platform in ["linux", "darwin"]:
-            return installed_posix()
-        elif sys.platform == "win32":
-            return installed_windows()
-        else:
-            return installed_posix()
+        try:
+            import ghostscript
+        except RuntimeError:
+            return False
+        return True
+
+
     def convert(self, pdf_path, png_path, resolution=300):
         if not self.installed():
diff --git a/camelot/cli.py b/camelot/cli.py
index 8aad5eb..414a44d 100644
--- a/camelot/cli.py
+++ b/camelot/cli.py
@@ -290,13 +290,204 @@ def stream(c, *args, **kwargs):
     columns = list(kwargs["columns"])
     kwargs["columns"] = None if not columns else columns
 
-    margins = conf.pop('margins')
+    margins = conf.pop("margins")
     if margins is None:
         layout_kwargs = {}
     else:
-        layout_kwargs = {"char_margin": margins[0], "line_margin": margins[1], "word_margin": margins[2]}
-
+        layout_kwargs = {
+            "char_margin": margins[0],
+            "line_margin": margins[1],
+            "word_margin": margins[2],
+        }
+
+    if plot_type is not None:
+        if not _HAS_MPL:
+            raise ImportError("matplotlib is required for plotting.")
+    else:
+        if output is None:
+            raise click.UsageError("Please specify output file path using --output")
+        if f is None:
+            raise click.UsageError("Please specify output file format using --format")
+
+    tables = read_pdf(
+        filepath,
+        pages=pages,
+        flavor="stream",
+        suppress_stdout=quiet,
+        layout_kwargs=layout_kwargs,
+        **kwargs,
+    )
+    click.echo(f"Found {tables.n} tables")
+    if plot_type is not None:
+        for table in tables:
+            plot(table, kind=plot_type)
+            plt.show()
+    else:
+        tables.export(output, f=f, compress=compress)
+
+
+@cli.command("hybrid")
+@click.option(
+    "-R",
+    "--table_regions",
+    default=[],
+    multiple=True,
+    help="Page regions to analyze. Example: x1,y1,x2,y2"
+    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+    "-T",
+    "--table_areas",
+    default=[],
+    multiple=True,
+    help="Table areas to process. Example: x1,y1,x2,y2"
+    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+    "-C",
+    "--columns",
+    default=[],
+    multiple=True,
+    help="X coordinates of column separators.",
+)
+@click.option(
+    "-e",
+    "--edge_tol",
+    default=50,
+    help="Tolerance parameter" " for extending textedges vertically.",
+)
+@click.option(
+    "-r",
+    "--row_tol",
+    default=2,
+    help="Tolerance parameter" " used to combine text vertically, to generate rows.",
+)
+@click.option(
+    "-c",
+    "--column_tol",
+    default=0,
+    help="Tolerance parameter"
+    " used to combine text horizontally, to generate columns.",
+)
+@click.option(
+    "-plot",
+    "--plot_type",
+    type=click.Choice(["text", "grid", "contour", "textedge"]),
+    help="Plot elements found on PDF page for visual debugging.",
+)
+@click.argument("filepath", type=click.Path(exists=True))
+@pass_config
+def hybrid(c, *args, **kwargs):
+    """Use a combination of text alignments and ruling lines to parse the table."""
+    conf = c.config
+    pages = conf.pop("pages")
+    output = conf.pop("output")
+    f = conf.pop("format")
+    compress = conf.pop("zip")
+    quiet = conf.pop("quiet")
+    plot_type = kwargs.pop("plot_type")
+    filepath = kwargs.pop("filepath")
+    kwargs.update(conf)
+
+    table_regions = list(kwargs["table_regions"])
+    kwargs["table_regions"] = None if not table_regions else table_regions
+    table_areas = list(kwargs["table_areas"])
+    kwargs["table_areas"] = None if not table_areas else table_areas
+    columns = list(kwargs["columns"])
+    kwargs["columns"] = None if not columns else columns
+
+    if plot_type is not None:
+        if not _HAS_MPL:
+            raise ImportError("matplotlib is required for plotting.")
+    else:
+        if output is None:
+            raise click.UsageError("Please specify output file path using --output")
+        if f is None:
+            raise click.UsageError("Please specify output file format using --format")
+
+    tables = read_pdf(
+        filepath, pages=pages, flavor="hybrid", suppress_stdout=quiet, **kwargs
+    )
+    click.echo(f"Found {tables.n} tables")
+    if plot_type is not None:
+        for table in tables:
+            plot(table, kind=plot_type)
+            plt.show()
+    else:
+        tables.export(output, f=f, compress=compress)
+
+
+@cli.command("network")
+@click.option(
+    "-R",
+    "--table_regions",
+    default=[],
+    multiple=True,
+    help="Page regions to analyze. Example: x1,y1,x2,y2"
+    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+    "-T",
+    "--table_areas",
+    default=[],
+    multiple=True,
+    help="Table areas to process. Example: x1,y1,x2,y2"
+    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+    "-C",
+    "--columns",
+    default=[],
+    multiple=True,
+    help="X coordinates of column separators.",
+)
+@click.option(
+    "-e",
+    "--edge_tol",
+    default=50,
+    help="Tolerance parameter" " for extending textedges vertically.",
+)
+@click.option(
+    "-r",
+    "--row_tol",
+    default=2,
+    help="Tolerance parameter" " used to combine text vertically, to generate rows.",
+)
+@click.option(
+    "-c",
+    "--column_tol",
+    default=0,
+    help="Tolerance parameter"
+    " used to combine text horizontally, to generate columns.",
+)
+@click.option(
+    "-plot",
+    "--plot_type",
+    type=click.Choice(["text", "grid", "contour", "textedge"]),
+    help="Plot elements found on PDF page for visual debugging.",
+)
+@click.argument("filepath", type=click.Path(exists=True))
+@pass_config
+def network(c, *args, **kwargs):
+    """Use the alignment of text to parse the table."""
+    conf = c.config
+    pages = conf.pop("pages")
+    output = conf.pop("output")
+    f = conf.pop("format")
+    compress = conf.pop("zip")
+    quiet = conf.pop("quiet")
+    plot_type = kwargs.pop("plot_type")
+    filepath = kwargs.pop("filepath")
+    kwargs.update(conf)
+
+    table_regions = list(kwargs["table_regions"])
+    kwargs["table_regions"] = None if not table_regions else table_regions
+    table_areas = list(kwargs["table_areas"])
+    kwargs["table_areas"] = None if not table_areas else table_areas
+    columns = list(kwargs["columns"])
+    kwargs["columns"] = None if not columns else columns
+
     if plot_type is not None:
         if not _HAS_MPL:
             raise ImportError("matplotlib is required for plotting.")
@@ -307,7 +498,7 @@ def stream(c, *args, **kwargs):
         raise click.UsageError("Please specify output file format using --format")
 
     tables = read_pdf(
-        filepath, pages=pages, flavor="stream", suppress_stdout=quiet, layout_kwargs=layout_kwargs, **kwargs
+        filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs
     )
     click.echo(f"Found {tables.n} tables")
     if plot_type is not None:
diff --git a/camelot/core.py b/camelot/core.py
index 03c7985..69469c8 100644
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -1,12 +1,21 @@
+import math
 import os
 import sqlite3
 import tempfile
 import zipfile
 from operator import itemgetter
 
+import cv2
 import numpy as np
 import pandas as pd
 
+from .backends import ImageConversionBackend
+from .utils import build_file_path_in_temp_dir
+from .utils import compute_accuracy
+from .utils import compute_whitespace
+from .utils import get_index_closest_point
+from .utils import get_textline_coords
+
 # minimum number of vertical textline intersections for a textedge
 # to be considered valid
@@ -15,14 +24,66 @@
 TABLE_AREA_PADDING = 10
 
-class TextEdge:
+HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"]
+VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
+ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
+
+
+class TextAlignment(object):
+    """Represents a list of textlines sharing an alignment on a coordinate.
+
+    The alignment can be left/right/middle or top/bottom/center.
+    (PDF coordinate space)
+
+    Parameters
+    ----------
+    coord : float
+        Coordinate of the initial text edge. Depending on the alignment
+        it could be a vertical or horizontal coordinate.
+    textline : obj
+        The original textline to start the alignment.
+    align : str
+        Name of the alignment (e.g. "left", "top", etc.)
+
+    Attributes
+    ----------
+    coord : float
+        The aligned coordinate, averaged out across textlines. It can be
+        along the x or y axis.
+    textlines : array
+        Array of textlines that demonstrate this alignment.
+    align : str
+        Name of the alignment (e.g. "left", "top", etc.)
+    """
+
+    def __init__(self, coord, textline, align):
+        self.coord = coord
+        self.textlines = [textline]
+        self.align = align
+
+    def __repr__(self):
+        text_inside = " | ".join(
+            map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
+        return (
+            f"<TextAlignment coord={self.coord} "
+            f"textlines={len(self.textlines)} text='{text_inside}'>"
+        )
+
+    def register_aligned_textline(self, textline, coord):
+        """Add a new textline to this alignment, adapting its average coord."""
+        # Increase the intersections for this segment, expand it up,
+        # and adjust the x based on the new value
+        self.coord = (self.coord * len(self.textlines) + coord) / \
+            float(len(self.textlines) + 1)
+        self.textlines.append(textline)
+
+
+class TextEdge(TextAlignment):
     """Defines a text edge's coordinates relative to a left-bottom origin.
     (PDF coordinate space)
+    An edge is an alignment bounded over a segment.
 
     Parameters
     ----------
-    x : float
-        x-coordinate of the text edge.
+    coord : float
+        coordinate of the text edge. Can be x or y.
     y0 : float
         y-coordinate of bottommost point.
     y1 : float
@@ -32,93 +93,114 @@ class TextEdge:
 
     Attributes
     ----------
-    intersections: int
-        Number of intersections with horizontal text rows.
     is_valid: bool
-        A text edge is valid if it intersections with at least
+        A text edge is valid if it intersects with at least
         TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
     """
 
-    def __init__(self, x, y0, y1, align="left"):
-        self.x = x
-        self.y0 = y0
-        self.y1 = y1
-        self.align = align
-        self.intersections = 0
+    def __init__(self, coord, textline, align):
+        super().__init__(coord, textline, align)
+        self.y0 = textline.y0
+        self.y1 = textline.y1
         self.is_valid = False
 
     def __repr__(self):
-        x = round(self.x, 2)
+        x = round(self.coord, 2)
         y0 = round(self.y0, 2)
         y1 = round(self.y1, 2)
         return (
             f"<TextEdge x={x} y0={y0} y1={y1} "
             f"align={self.align} valid={self.is_valid}>"
         )
 
-    def update_coords(self, x, y0, edge_tol=50):
+    def update_coords(self, x, textline, edge_tol=50):
         """Updates the text edge's x and bottom y coordinates
         and sets the is_valid attribute.
         """
-        if np.isclose(self.y0, y0, atol=edge_tol):
-            self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
-            self.y0 = y0
-            self.intersections += 1
+        if math.isclose(self.y0, textline.y0, abs_tol=edge_tol):
+            self.register_aligned_textline(textline, x)
+            self.y0 = textline.y0
             # a textedge is valid only if it extends uninterrupted
             # over a required number of textlines
-            if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
+            if len(self.textlines) > TEXTEDGE_REQUIRED_ELEMENTS:
                 self.is_valid = True
 
 
-class TextEdges:
+class TextAlignments:
+    """Defines a dict of text alignments across reference alignments."""
+
+    def __init__(self, alignment_names):
+        # For each possible alignment, list of tuples coordinate/textlines
+        self._text_alignments = {}
+        for alignment_name in alignment_names:
+            self._text_alignments[alignment_name] = []
+
+    @staticmethod
+    def _create_new_text_alignment(coord, textline, align):
+        return TextAlignment(coord, textline, align)
+
+    def _update_alignment(self, alignment, coord, textline):
+        raise NotImplementedError()
+
+    def _register_textline(self, textline):
+        """Update an existing text alignment or create a new one.
+ """ + coords = get_textline_coords(textline) + for alignment_id, alignment_array in self._text_alignments.items(): + coord = coords[alignment_id] + + # Find the index of the closest existing element (or 0 if none) + idx_closest = get_index_closest_point( + coord, alignment_array, fn=lambda x: x.coord + ) + + # Check if the edges before/after are close enough + # that it can be considered aligned + idx_insert = None + if idx_closest is None: + idx_insert = 0 + else: + coord_closest = alignment_array[idx_closest].coord + # Note: np.isclose is slow! + if coord - 0.5 < coord_closest < coord + 0.5: + self._update_alignment( + alignment_array[idx_closest], + coord, + textline + ) + elif coord_closest < coord: + idx_insert = idx_closest + 1 + else: + idx_insert = idx_closest + if idx_insert is not None: + new_alignment = self._create_new_text_alignment( + coord, textline, alignment_id + ) + alignment_array.insert(idx_insert, new_alignment) + + +class TextEdges(TextAlignments): """Defines a dict of left, right and middle text edges found on the PDF page. The dict has three keys based on the alignments, and each key's value is a list of camelot.core.TextEdge objects. """ def __init__(self, edge_tol=50): + super().__init__(HORIZONTAL_ALIGNMENTS) self.edge_tol = edge_tol - self._textedges = {"left": [], "right": [], "middle": []} - @staticmethod - def get_x_coord(textline, align): - """Returns the x coordinate of a text row based on the - specified alignment. - """ - x_left = textline.x0 - x_right = textline.x1 - x_middle = x_left + (x_right - x_left) / 2.0 - x_coord = {"left": x_left, "middle": x_middle, "right": x_right} - return x_coord[align] - - def find(self, x_coord, align): - """Returns the index of an existing text edge using - the specified x coordinate and alignment. 
- """ - for i, te in enumerate(self._textedges[align]): - if np.isclose(te.x, x_coord, atol=0.5): - return i - return None + def _create_new_text_alignment(self, coord, textline, align): + # In TextEdges, each alignment is a TextEdge + return TextEdge(coord, textline, align) - def add(self, textline, align): + def add(self, coord, textline, align): """Adds a new text edge to the current dict.""" - x = self.get_x_coord(textline, align) - y0 = textline.y0 - y1 = textline.y1 - te = TextEdge(x, y0, y1, align=align) - self._textedges[align].append(te) - - def update(self, textline): - """Updates an existing text edge in the current dict.""" - for align in ["left", "right", "middle"]: - x_coord = self.get_x_coord(textline, align) - idx = self.find(x_coord, align) - if idx is None: - self.add(textline, align) - else: - self._textedges[align][idx].update_coords( - x_coord, textline.y0, edge_tol=self.edge_tol - ) + te = self._create_new_text_alignment(coord, textline, align) + self._text_alignments[align].append(te) + + def _update_alignment(self, alignment, coord, textline): + alignment.update_coords(coord, textline, self.edge_tol) def generate(self, textlines): """Generates the text edges dict based on horizontal text @@ -126,7 +208,7 @@ def generate(self, textlines): """ for tl in textlines: if len(tl.get_text().strip()) > 1: # TODO: hacky - self.update(tl) + self._register_textline(tl) def get_relevant(self): """Returns the list of relevant text edges (all share the same @@ -135,13 +217,19 @@ def get_relevant(self): """ intersections_sum = { "left": sum( - te.intersections for te in self._textedges["left"] if te.is_valid + len(te.textlines) + for te in self._text_alignments["left"] + if te.is_valid ), "right": sum( - te.intersections for te in self._textedges["right"] if te.is_valid + len(te.textlines) + for te in self._text_alignments["right"] + if te.is_valid ), "middle": sum( - te.intersections for te in self._textedges["middle"] if te.is_valid + len(te.textlines) + for te in self._text_alignments["middle"] + if te.is_valid ), } @@ -149,7 +237,7 @@ def get_relevant(self): # get vertical textedges that intersect maximum number of # times with horizontal textlines relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] - return self._textedges[relevant_align] + return list(filter(lambda te: te.is_valid, self._text_alignments[relevant_align])) def get_table_areas(self, textlines, relevant_textedges): """Returns a dict of interesting table areas on the PDF page @@ -165,31 +253,30 @@ def pad(area, average_row_height): return (x0, y0, x1, y1) # sort relevant textedges in reading order - relevant_textedges.sort(key=lambda te: (-te.y0, te.x)) + relevant_textedges.sort(key=lambda te: (-te.y0, te.coord)) table_areas = {} for te in relevant_textedges: - if te.is_valid: - if not table_areas: - table_areas[(te.x, te.y0, te.x, te.y1)] = None + if not table_areas: + table_areas[(te.coord, te.y0, te.coord, te.y1)] = None + else: + found = None + for area in table_areas: + # check for overlap + if te.y1 >= area[1] and te.y0 <= area[3]: + found = area + break + if found is None: + table_areas[(te.coord, te.y0, te.coord, te.y1)] = None else: - found = None - for area in table_areas: - # check for overlap - if te.y1 >= area[1] and te.y0 <= area[3]: - found = area - break - if found is None: - table_areas[(te.x, te.y0, te.x, te.y1)] = None - else: - table_areas.pop(found) - updated_area = ( - found[0], - min(te.y0, found[1]), - max(found[2], te.x), - max(found[3], te.y1), - ) - 
table_areas[updated_area] = None + table_areas.pop(found) + updated_area = ( + found[0], + min(te.y0, found[1]), + max(found[2], te.coord), + max(found[3], te.y1), + ) + table_areas[updated_area] = None # extend table areas based on textlines that overlap # vertically. it's possible that these textlines were @@ -327,6 +414,8 @@ class Table: Accuracy with which text was assigned to the cell. whitespace : float Percentage of whitespace in the table. + filename : str + Path of the original PDF order : int Table number on PDF page. page : int @@ -342,8 +431,17 @@ def __init__(self, cols, rows): self.shape = (0, 0) self.accuracy = 0 self.whitespace = 0 + self.filename = None self.order = None self.page = None + self.flavor = None # Flavor of the parser that generated the table + self.pdf_size = None # Dimensions of the original PDF page + self._bbox = None # Bounding box in original document + self.parse = None # Parse information + self.parse_details = None # Field holding debug data + + self._image = None + self._image_path = None # Temporary file to hold an image of the pdf def __repr__(self): return f"<{self.__class__.__name__} shape={self.shape}>" @@ -377,6 +475,32 @@ def parsing_report(self): } return report + def record_metadata(self, parser): + """Record data about the origin of the table""" + self.flavor = parser.id + self.filename = parser.filename + self.debug_info = parser.debug_info + if parser.copy_text is not None: + self.copy_spanning_text(parser.copy_text) + data = self.data + self.df = pd.DataFrame(data) + self.shape = self.df.shape + + self.whitespace = compute_whitespace(data) + self.pdf_size = (parser.pdf_width, parser.pdf_height) + + def get_pdf_image(self): + """Compute pdf image and cache it""" + if self._image is None: + if self._image_path is None: + self._image_path = build_file_path_in_temp_dir( + os.path.basename(self.filename), ".png" + ) + backend = ImageConversionBackend(use_fallback=True) + backend.convert(self.filename, self._image_path) + self._image = cv2.imread(self._image_path) + return self._image + def set_all_edges(self): """Sets all table edges to True.""" for row in self.cells: @@ -560,6 +684,34 @@ def set_span(self): cell.hspan = True return self + def copy_spanning_text(self, copy_text=None): + """Copies over text in empty spanning cells. + Parameters + ---------- + copy_text : list, optional (default: None) + {'h', 'v'} + Select one or more strings from above and pass them as a list + to specify the direction in which text should be copied over + when a cell spans multiple rows or columns. + Returns + ------- + t : camelot.core.Table + """ + for f in copy_text: + if f == "h": + for i in range(len(self.cells)): + for j in range(len(self.cells[i])): + if self.cells[i][j].text.strip() == "": + if self.cells[i][j].hspan and not self.cells[i][j].left: + self.cells[i][j].text = self.cells[i][j - 1].text + elif f == "v": + for i in range(len(self.cells)): + for j in range(len(self.cells[i])): + if self.cells[i][j].text.strip() == "": + if self.cells[i][j].vspan and not self.cells[i][j].top: + self.cells[i][j].text = self.cells[i - 1][j].text + return self + def to_csv(self, path, **kwargs): """Writes Table to a comma-separated values (csv) file. 
@@ -686,8 +838,7 @@ def __getitem__(self, idx): return self._tables[idx] def __iter__(self): - for t in self._tables: - yield t + yield from self._tables @staticmethod def _format_func(table, f): diff --git a/camelot/handlers.py b/camelot/handlers.py index 74ddde7..99a8c54 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -9,15 +9,24 @@ from pypdf._utils import StrByteType from .core import TableList +from .parsers import Hybrid from .parsers import Lattice +from .parsers import Network from .parsers import Stream from .utils import TemporaryDirectory +from .utils import build_file_path_in_temp_dir from .utils import download_url from .utils import get_page_layout from .utils import get_rotation from .utils import get_text_objects from .utils import is_url +PARSERS = { + "lattice": Lattice, + "stream": Stream, + "network": Network, + "hybrid": Hybrid, +} class PDFHandler: """Handles all operations like temp directory creation, splitting @@ -33,10 +42,14 @@ class PDFHandler: Example: '1,3,4' or '1,4-end' or 'all'. password : str, optional (default: None) Password for decryption. - + debug : bool, optional (default: False) + Whether the parser should store debug information during parsing. """ - def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None): + def __init__( + self, filepath: Union[StrByteType, Path], pages="1", password=None, debug=False + ): + self.debug = debug if is_url(filepath): filepath = download_url(filepath) self.filepath: Union[StrByteType, Path] = filepath @@ -101,12 +114,22 @@ def _save_page(self, filepath: Union[StrByteType, Path], page, temp): Parameters ---------- - filepath : str - Filepath or URL of the PDF file. page : int Page number. - temp : str - Tmp directory. + layout_kwargs : dict, optional (default: {}) + A dict of `pdfminer.layout.LAParams `_ kwargs. # noqa + + + Returns + ------- + layout : object + + dimensions : tuple + The dimensions of the pdf page + + filepath : str + The path of the single page PDF - either the original, or a + normalized version. """ infile = PdfReader(filepath, strict=False) @@ -149,7 +172,7 @@ def parse( suppress_stdout=False, parallel=False, layout_kwargs=None, - **kwargs + **kwargs, ): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -157,7 +180,7 @@ def parse( Parameters ---------- flavor : str (default: 'lattice') - The parsing method to use ('lattice' or 'stream'). + The parsing method to use. Lattice is used by default. suppress_stdout : bool (default: False) Suppress logs and warnings. 
@@ -179,7 +202,9 @@
             layout_kwargs = {}
 
         tables = []
-        parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
+        parser_obj = PARSERS[flavor]
+        parser = parser_obj(debug=self.debug, **kwargs)
+
         with TemporaryDirectory() as tempdir:
             cpu_count = mp.cpu_count()
             # Using multiprocessing only when cpu_count > 1 to prevent a stalling issue
@@ -189,7 +215,8 @@
                 jobs = []
                 for p in self.pages:
                     j = pool.apply_async(
-                        self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                        self._parse_page,
+                        (p, tempdir, parser, suppress_stdout, layout_kwargs),
                     )
                     jobs.append(j)
 
@@ -198,14 +225,14 @@
                     tables.extend(t)
         else:
             for p in self.pages:
-                t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                t = self._parse_page(
+                    p, tempdir, parser, suppress_stdout, layout_kwargs
+                )
                 tables.extend(t)
 
         return TableList(sorted(tables))
 
-    def _parse_page(
-        self, page, tempdir, parser, suppress_stdout, layout_kwargs
-    ):
+    def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs):
         """Extracts tables by calling parser.get_tables on a single
         page PDF.
 
@@ -213,8 +240,8 @@
         ----------
         page : str
             Page number to parse
-        parser : Lattice or Stream
-            The parser to use (Lattice or Stream).
+        parser : Lattice, Stream, Network or Hybrid
+            The parser to use.
         suppress_stdout : bool
             Suppress logs and warnings.
         layout_kwargs : dict, optional (default: {})
@@ -224,11 +251,13 @@
         -------
         tables : camelot.core.TableList
             List of tables found in PDF.
-
+
         """
         self._save_page(self.filepath, page, tempdir)
         page_path = os.path.join(tempdir, f"page-{page}.pdf")
-        tables = parser.extract_tables(
-            page_path, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
+        layout, dimensions = get_page_layout(page_path, **layout_kwargs)
+        parser.prepare_page_parse(
+            page_path, layout, dimensions, page, layout_kwargs=layout_kwargs
         )
+        tables = parser.extract_tables()
         return tables
diff --git a/camelot/io.py b/camelot/io.py
index 1271882..215839a 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -17,6 +17,7 @@ def read_pdf(
     suppress_stdout=False,
     parallel=False,
     layout_kwargs=None,
+    debug=False,
     **kwargs
 ):
     """Read PDF and return extracted tables.
@@ -110,9 +111,10 @@
     """
     if layout_kwargs is None:
         layout_kwargs = {}
-    if flavor not in ["lattice", "stream"]:
+    if flavor not in ["lattice", "stream", "network", "hybrid"]:
         raise NotImplementedError(
-            "Unknown flavor specified." " Use either 'lattice' or 'stream'"
+            "Unknown flavor specified."
+            " Use either 'lattice', 'stream', 'network' or 'hybrid'"
         )
 
     with warnings.catch_warnings():
@@ -120,7 +122,7 @@
             warnings.simplefilter("ignore")
         validate_input(kwargs, flavor=flavor)
 
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, debug=debug)
         kwargs = remove_extra(kwargs, flavor=flavor)
         tables = p.parse(
             flavor=flavor,
diff --git a/camelot/parsers/__init__.py b/camelot/parsers/__init__.py
index 9019058..4e35796 100644
--- a/camelot/parsers/__init__.py
+++ b/camelot/parsers/__init__.py
@@ -1,2 +1,4 @@
 from .lattice import Lattice
 from .stream import Stream
+from .network import Network
+from .hybrid import Hybrid
diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
index 5e4b35f..ed327a1 100644
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@@ -1,19 +1,455 @@
+"""Defines a base parser.
As well as generic methods for other parsers.""" + import os +import warnings + +import numpy as np +import pandas as pd -from ..utils import get_page_layout +from ..core import Table +from ..utils import bbox_from_str +from ..utils import compute_accuracy +from ..utils import compute_whitespace +from ..utils import get_table_index from ..utils import get_text_objects +from ..utils import text_in_bbox class BaseParser: """Defines a base parser.""" - def _generate_layout(self, filename, layout_kwargs): + def __init__( + self, + parser_id, + table_regions=None, + table_areas=None, + copy_text=None, + split_text=False, + strip_text="", + shift_text=None, + flag_size=False, + debug=False, + ): + self.id = parser_id + self.table_regions = table_regions + self.table_areas = table_areas + self.table_bbox_parses = {} + + self.columns = None + self.copy_text = copy_text + self.split_text = split_text + self.strip_text = strip_text + self.shift_text = shift_text + + self.flag_size = flag_size + + self.rootname = None + self.t_bbox = None + + # For plotting details of parsing algorithms + self.resolution = 300 # default plotting resolution of the PDF. + self.parse_details = {} + if not debug: + self.parse_details = None + + def table_bboxes(self): + return sorted(self.table_bbox_parses.keys(), key=lambda x: x[1], reverse=True) + + def prepare_page_parse(self, filename, layout, dimensions, page_idx, layout_kwargs): + """Prepare the page for parsing.""" self.filename = filename self.layout_kwargs = layout_kwargs - self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs) + self.layout = layout + self.dimensions = dimensions + self.page = page_idx self.images = get_text_objects(self.layout, ltype="image") self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text") self.vertical_text = get_text_objects(self.layout, ltype="vertical_text") self.pdf_width, self.pdf_height = self.dimensions self.rootname, __ = os.path.splitext(self.filename) - self.imagename = "".join([self.rootname, ".png"]) + + if self.parse_details is not None: + self.parse_details["table_regions"] = self.table_regions + self.parse_details["table_areas"] = self.table_areas + + def _apply_regions_filter(self, textlines): + """If regions have been specified, filter textlines to these regions. + + Parameters + ---------- + textlines : list + list of textlines to be filtered + + Returns + ------- + filtered_textlines : list of textlines within the regions specified + + """ + filtered_textlines = [] + if self.table_regions is None: + filtered_textlines.extend(textlines) + else: + for region_str in self.table_regions: + region_text = text_in_bbox(bbox_from_str(region_str), textlines) + filtered_textlines.extend(region_text) + return filtered_textlines + + def _document_has_no_text(self): + """Detects image only documents and warns. + + Returns + ------- + has_no_text : bool + Whether the document doesn't have any text at all. + """ + if not self.horizontal_text: + rootname = os.path.basename(self.rootname) + if self.images: + warnings.warn( + "{rootname} is image-based, " + "camelot only works on text-based pages.".format(rootname=rootname) + ) + else: + warnings.warn(f"No tables found on {rootname}") + return True + return False + + def _initialize_new_table(self, table_idx, bbox, cols, rows): + """Initialize new table object, ready to be populated. 
+ + Parameters + ---------- + table_idx : int + Index of this table within the pdf page analyzed + bbox : set + bounding box of this table within the pdf page analyzed + cols : list + list of coordinate boundaries tuples (left, right) + rows : list + list of coordinate boundaries tuples (bottom, top) + + Returns + ------- + table : camelot.core.Table + + """ + table = Table(cols, rows) + table.page = self.page + table.order = table_idx + 1 + table._bbox = bbox + return table + + @staticmethod + def _reduce_index(t, idx, shift_text): + """ + Reduces index of a text object if it lies within a spanning + cell. Only useful for some parsers (e.g. Lattice), base method is a + noop. + """ + return idx + + def compute_parse_errors(self, table): + pos_errors = [] + # TODO: have a single list in place of two directional ones? + # sorted on x-coordinate based on reading order i.e. LTR or RTL + for direction in ["vertical", "horizontal"]: + for t in self.t_bbox[direction]: + indices, error = get_table_index( + table, + t, + direction, + split_text=self.split_text, + flag_size=self.flag_size, + strip_text=self.strip_text, + ) + if indices[0][:2] != (-1, -1): + pos_errors.append(error) + indices = type(self)._reduce_index( + table, indices, shift_text=self.shift_text + ) + for r_idx, c_idx, text in indices: + table.cells[r_idx][c_idx].text = text + return pos_errors + + def _generate_columns_and_rows(self, bbox, user_cols): + # Pure virtual, must be defined by the derived parser + raise NotImplementedError() + + def _generate_table(self, table_idx, bbox, cols, rows, **kwargs): + # Pure virtual, must be defined by the derived parser + raise NotImplementedError() + + def _generate_table_bbox(self): + # Pure virtual, must be defined by the derived parser + raise NotImplementedError() + + def extract_tables(self): + """Extract tables from the document.""" + if self._document_has_no_text(): + return [] + + # Identify plausible areas within the doc where tables lie, + # populate table_bbox keys with these areas. 
+ self._generate_table_bbox() + + _tables = [] + # sort tables based on y-coord + for table_idx, bbox in enumerate(self.table_bboxes()): + if self.columns is not None and self.columns[table_idx] != "": + # user has to input boundary columns too + # take (0, pdf_width) by default + # similar to else condition + # len can't be 1 + user_cols = self.columns[table_idx].split(",") + user_cols = [float(c) for c in user_cols] + else: + user_cols = None + + cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols) + table = self._generate_table(table_idx, bbox, cols, rows, v_s=v_s, h_s=h_s) + _tables.append(table) + + return _tables + + def record_parse_metadata(self, table): + """Record data about the origin of the table.""" + table.flavor = self.id + table.filename = self.filename + table.parse = self.table_bbox_parses[table._bbox] + table.parse_details = self.parse_details + pos_errors = self.compute_parse_errors(table) + table.accuracy = compute_accuracy([[100, pos_errors]]) + + if self.copy_text is not None: + table.copy_spanning_text(self.copy_text) + + data = table.data + table.df = pd.DataFrame(data) + table.shape = table.df.shape + + table.whitespace = compute_whitespace(data) + table.pdf_size = (self.pdf_width, self.pdf_height) + + _text = [] + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) + _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) + table._text = _text + table.textlines = self.horizontal_text + self.vertical_text + + +class TextBaseParser(BaseParser): + """Base class for all text parsers.""" + + def __init__( + self, + parser_id, + table_regions=None, + table_areas=None, + columns=None, + flag_size=False, + split_text=False, + strip_text="", + edge_tol=50, + row_tol=2, + column_tol=0, + debug=False, + **kwargs, + ): + """Initialize the text base parser class with default values.""" + super().__init__( + parser_id, + table_regions=table_regions, + table_areas=table_areas, + split_text=split_text, + strip_text=strip_text, + flag_size=flag_size, + debug=debug, + ) + self.columns = columns + self._validate_columns() + self.edge_tol = edge_tol + self.row_tol = row_tol + self.column_tol = column_tol + + @staticmethod + def _group_rows(text, row_tol=2): + """ + Groups PDFMiner text objects into rows vertically + within a tolerance. + + Parameters + ---------- + text : list + List of PDFMiner text objects. + row_tol : int, optional (default: 2) + + Returns + ------- + rows : list + Two-dimensional list of text objects grouped into rows. + + """ + row_y = None + rows = [] + temp = [] + non_empty_text = [t for t in text if t.get_text().strip()] + for t in non_empty_text: + # is checking for upright necessary? + # if t.get_text().strip() and all([obj.upright \ + # for obj in t._objs + # if type(obj) is LTChar]): + if row_y is None: + row_y = t.y0 + elif not np.isclose(row_y, t.y0, atol=row_tol): + rows.append(sorted(temp, key=lambda t: t.x0)) + temp = [] + # We update the row's bottom as we go, to be forgiving if there + # is a gradual change across multiple columns. + row_y = t.y0 + temp.append(t) + rows.append(sorted(temp, key=lambda t: t.x0)) + return rows + + @staticmethod + def _merge_columns(cl, column_tol=0): + """Merges column boundaries horizontally if they overlap + or lie within a tolerance. + + Parameters + ---------- + cl : list + List of column x-coordinate tuples. + column_tol : int, optional (default: 0) + + Returns + ------- + merged : list + List of merged column x-coordinate tuples. 
+ + """ + merged = [] + for higher in cl: + if not merged: + merged.append(higher) + else: + lower = merged[-1] + if column_tol >= 0: + if higher[0] <= lower[1] or np.isclose( + higher[0], lower[1], atol=column_tol + ): + upper_bound = max(lower[1], higher[1]) + lower_bound = min(lower[0], higher[0]) + merged[-1] = (lower_bound, upper_bound) + else: + merged.append(higher) + elif column_tol < 0: + if higher[0] <= lower[1]: + if np.isclose(higher[0], lower[1], atol=abs(column_tol)): + merged.append(higher) + else: + upper_bound = max(lower[1], higher[1]) + lower_bound = min(lower[0], higher[0]) + merged[-1] = (lower_bound, upper_bound) + else: + merged.append(higher) + return merged + + @staticmethod + def _join_rows(rows_grouped, text_y_max, text_y_min): + """ + Makes row coordinates continuous. For the row to "touch" + we split the existing gap between them in half. + + Parameters + ---------- + rows_grouped : list + Two-dimensional list of text objects grouped into rows. + text_y_max : int + text_y_min : int + + Returns + ------- + rows : list + List of continuous row y-coordinate tuples. + + """ + row_boundaries = [ + [max(t.y1 for t in r), min(t.y0 for t in r)] for r in rows_grouped + ] + for i in range(0, len(row_boundaries) - 1): + top_row = row_boundaries[i] + bottom_row = row_boundaries[i + 1] + top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2 + row_boundaries[0][0] = text_y_max + row_boundaries[-1][1] = text_y_min + return row_boundaries + + @staticmethod + def _add_columns(cols, text, row_tol): + """Adds columns to existing list by taking into account + the text that lies outside the current column x-coordinates. + + Parameters + ---------- + cols : list + List of column x-coordinate tuples. + text : list + List of PDFMiner text objects. + ytol : int + + Returns + ------- + cols : list + Updated list of column x-coordinate tuples. + + """ + if text: + text = TextBaseParser._group_rows(text, row_tol=row_tol) + elements = [len(r) for r in text] + new_cols = [ + (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r + ] + cols.extend(TextBaseParser._merge_columns(sorted(new_cols))) + return cols + + @staticmethod + def _join_columns(cols, text_x_min, text_x_max): + """Makes column coordinates continuous. + + Parameters + ---------- + cols : list + List of column x-coordinate tuples. + text_x_min : int + text_y_max : int + + Returns + ------- + cols : list + Updated list of column x-coordinate tuples. 
+ + """ + cols = sorted(cols) + cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] + cols.insert(0, text_x_min) + cols.append(text_x_max) + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + return cols + + def _validate_columns(self): + if self.table_areas is not None and self.columns is not None: + if len(self.table_areas) != len(self.columns): + raise ValueError("Length of table_areas and columns" " should be equal") + + def _generate_table(self, table_idx, bbox, cols, rows, **kwargs): + table = self._initialize_new_table(table_idx, bbox, cols, rows) + table = table.set_all_edges() + self.record_parse_metadata(table) + + return table + + def record_parse_metadata(self, table): + """Record data about the origin of the table.""" + super().record_parse_metadata(table) + # for plotting + table._segments = None diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py new file mode 100644 index 0000000..baebf27 --- /dev/null +++ b/camelot/parsers/hybrid.py @@ -0,0 +1,233 @@ +"""Implementation of hybrid table parser.""" + +import numpy as np +from .base import BaseParser +from .network import Network +from .lattice import Lattice +from ..utils import bboxes_overlap +from ..utils import boundaries_to_split_lines + + +class Hybrid(BaseParser): + """Defines a hybrid parser, leveraging both network and lattice parsers. + + Parameters + ---------- + table_regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. + table_areas : list, optional (default: None) + List of table area strings of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. + columns : list, optional (default: None) + List of column x-coordinates strings where the coordinates + are comma-separated. + split_text : bool, optional (default: False) + Split text that spans across multiple cells. + flag_size : bool, optional (default: False) + Flag text based on font size. Useful to detect + super/subscripts. Adds around flagged text. + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + edge_tol : int, optional (default: 50) + Tolerance parameter for extending textedges vertically. + row_tol : int, optional (default: 2) + Tolerance parameter used to combine text vertically, + to generate rows. + column_tol : int, optional (default: 0) + Tolerance parameter used to combine text horizontally, + to generate columns. 
+ + """ + + def __init__( + self, + table_regions=None, + table_areas=None, + columns=None, + flag_size=False, + split_text=False, + strip_text="", + edge_tol=None, + row_tol=2, + column_tol=0, + debug=False, + **kwargs + ): + super().__init__( + "hybrid", + table_regions=table_regions, + table_areas=table_areas, + flag_size=flag_size, + split_text=split_text, + strip_text=strip_text, + debug=debug, + ) + self.columns = columns # Columns settings impacts the hybrid table + self.network_parser = Network( + table_regions=table_regions, + table_areas=table_areas, + columns=columns, + flag_size=flag_size, + split_text=split_text, + strip_text=strip_text, + edge_tol=edge_tol, + row_tol=row_tol, + column_tol=column_tol, + debug=debug, + ) + self.lattice_parser = Lattice( + table_regions=table_regions, + table_areas=table_areas, + flag_size=flag_size, + split_text=split_text, + strip_text=strip_text, + edge_tol=edge_tol, + row_tol=row_tol, + column_tol=column_tol, + debug=debug, + ) + + def prepare_page_parse(self, filename, layout, dimensions, page_idx, layout_kwargs): + super().prepare_page_parse( + filename, layout, dimensions, page_idx, layout_kwargs + ) + self.network_parser.prepare_page_parse( + filename, layout, dimensions, page_idx, layout_kwargs + ) + self.lattice_parser.prepare_page_parse( + filename, layout, dimensions, page_idx, layout_kwargs + ) + + def _generate_columns_and_rows(self, bbox, table_idx): + parser = self.table_bbox_parses[bbox] + return parser._generate_columns_and_rows(bbox, table_idx) + + def _generate_table(self, table_idx, bbox, cols, rows, **kwargs): + parser = self.table_bbox_parses[bbox] + table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs) + # Because hybrid can inject extraneous splits from both lattice and + # network, remove lines / cols that are completely empty. + table.df = table.df.replace("", np.nan) + table.df = table.df.dropna(axis=0, how="all") + table.df = table.df.dropna(axis=1, how="all") + table.df = table.df.replace(np.nan, "") + table.shape = table.df.shape + return table + + @staticmethod + def _augment_boundaries_with_splits(boundaries, splits, tolerance=0): + """ Augment existing boundaries using provided hard splits. + + Boundaries: |---| |-| |---------| + Splits: | | | | + Augmented: |-------|-----|-------|--| + """ + idx_boundaries = len(boundaries) - 1 + idx_splits = len(splits) - 1 + previous_boundary = None + while True: + if idx_splits < 0: + # No more splits to incorporate, we're done + break + split = splits[idx_splits] + + if idx_boundaries < 0: + # Need to insert remaining splits + new_boundary = [split, boundaries[0][0]] + boundaries.insert(0, new_boundary) + idx_splits = idx_splits - 1 + else: + boundary = \ + boundaries[idx_boundaries] + if boundary[1] < \ + split + tolerance: + # The lattice column is further to the right of our + # col boundary. We move our left boundary to match. 
+ boundary[1] = split + # And if there was another segment after, we make its + # right boundary match as well so that there's no gap + if previous_boundary is not None: + previous_boundary[0] = split + idx_splits = idx_splits - 1 + elif boundary[0] > \ + split - tolerance: + # Our boundary is fully after the split, move on + idx_boundaries = idx_boundaries - 1 + previous_boundary = boundary + if idx_boundaries < 0: + # If this is the last boundary to the left, set its + # edge at the split + boundary[0] = split + idx_splits = idx_splits - 1 + else: + # The split is inside our boundary: split it + new_boundary = [split, boundary[1]] + boundaries.insert(idx_boundaries + 1, new_boundary) + boundary[1] = split + previous_boundary = new_boundary + idx_splits = idx_splits - 1 + return boundaries + + def _merge_bbox_analysis(self, lattice_bbox, network_bbox): + """ Identify splits that were only detected by lattice or by network + """ + lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox] + lattice_cols = lattice_parse["col_anchors"] + + network_bbox_data = self.network_parser.table_bbox_parses[network_bbox] + network_cols_boundaries = network_bbox_data["cols_boundaries"] + + # Favor network, but complete or adjust its columns based on the + # splits identified by lattice. + if network_cols_boundaries is None: + self.table_bbox_parses[lattice_bbox] = self.lattice_parser + else: + network_cols_boundaries = self._augment_boundaries_with_splits( + network_cols_boundaries, lattice_cols, self.lattice_parser.joint_tol + ) + augmented_bbox = ( + network_cols_boundaries[0][0], + min(lattice_bbox[1], network_bbox[1]), + network_cols_boundaries[-1][1], + max(lattice_bbox[3], network_bbox[3]), + ) + network_bbox_data["cols_anchors"] = \ + boundaries_to_split_lines(network_cols_boundaries) + + del self.network_parser.table_bbox_parses[network_bbox] + self.network_parser.table_bbox_parses[augmented_bbox] = network_bbox_data + self.table_bbox_parses[augmented_bbox] = self.network_parser + + def _generate_table_bbox(self): + # Collect bboxes from both parsers + self.lattice_parser._generate_table_bbox() + _lattice_bboxes = sorted( + self.lattice_parser.table_bbox_parses, key=lambda bbox: (bbox[0], -bbox[1]) + ) + self.network_parser._generate_table_bbox() + _network_bboxes = sorted( + self.network_parser.table_bbox_parses, key=lambda bbox: (bbox[0], -bbox[1]) + ) + + # Merge the data from both processes + for lattice_bbox in _lattice_bboxes: + merged = False + + for idx in range(len(_network_bboxes) - 1, -1, -1): + network_bbox = _network_bboxes[idx] + if not bboxes_overlap(lattice_bbox, network_bbox): + continue + self._merge_bbox_analysis(lattice_bbox, network_bbox) + # network_bbox_data["cols_boundaries"] + del _network_bboxes[idx] + merged = True + if not merged: + self.table_bbox_parses[lattice_bbox] = self.lattice_parser + + # Add the bboxes from network that haven't been merged + for network_bbox in _network_bboxes: + self.table_bbox_parses[network_bbox] = self.network_parser diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 5d3c771..a6adc26 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -1,33 +1,23 @@ import copy -import locale -import logging import os -import sys import warnings - import numpy as np import pandas as pd from ..backends.image_conversion import BACKENDS -from ..core import Table from ..image_processing import adaptive_threshold from ..image_processing import find_contours from ..image_processing import find_joints from 
..image_processing import find_lines -from ..utils import compute_accuracy -from ..utils import compute_whitespace -from ..utils import get_table_index +from ..utils import build_file_path_in_temp_dir from ..utils import merge_close_lines from ..utils import scale_image from ..utils import scale_pdf from ..utils import segments_in_bbox -from ..utils import text_in_bbox +from ..utils import text_in_bbox_per_axis from .base import BaseParser -logger = logging.getLogger("camelot") - - class Lattice(BaseParser): """Lattice method of parsing looks for lines between text to parse the table. @@ -95,7 +85,7 @@ def __init__( process_background=False, line_scale=15, copy_text=None, - shift_text=["l", "t"], + shift_text=None, split_text=False, flag_size=False, strip_text="", @@ -108,12 +98,13 @@ def __init__( backend="ghostscript", **kwargs, ): + super().__init__("lattice") self.table_regions = table_regions self.table_areas = table_areas self.process_background = process_background self.line_scale = line_scale self.copy_text = copy_text - self.shift_text = shift_text + self.shift_text = shift_text or ["l", "t"] self.split_text = split_text self.flag_size = flag_size self.strip_text = strip_text @@ -124,6 +115,8 @@ def __init__( self.iterations = iterations self.resolution = resolution self.backend = Lattice._get_backend(backend) + self.image_path = None + self.pdf_image = None @staticmethod def _get_backend(backend): @@ -196,38 +189,13 @@ def _reduce_index(table, idx, shift_text): indices.append((r_idx, c_idx, text)) return indices - @staticmethod - def _copy_spanning_text(table, copy_text=None): - """Copies over text in empty spanning cells. - - Parameters - ---------- - table : camelot.core.Table - copy_text : list, optional (default: None) - {'h', 'v'} - Select one or more strings from above and pass them as a list - to specify the direction in which text should be copied over - when a cell spans multiple rows or columns. 
- - Returns - ------- - table : camelot.core.Table - + def record_parse_metadata(self, table): + """Record data about the origin of the table """ - for f in copy_text: - if f == "h": - for i in range(len(table.cells)): - for j in range(len(table.cells[i])): - if table.cells[i][j].text.strip() == "": - if table.cells[i][j].hspan and not table.cells[i][j].left: - table.cells[i][j].text = table.cells[i][j - 1].text - elif f == "v": - for i in range(len(table.cells)): - for j in range(len(table.cells[i])): - if table.cells[i][j].text.strip() == "": - if table.cells[i][j].vspan and not table.cells[i][j].top: - table.cells[i][j].text = table.cells[i - 1][j].text - return table + super().record_parse_metadata(table) + # for plotting + table._image = self.pdf_image # Reuse the image used for calc + table._segments = (self.vertical_segments, self.horizontal_segments) def _generate_table_bbox(self): def scale_areas(areas): @@ -242,15 +210,20 @@ def scale_areas(areas): scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) return scaled_areas - self.image, self.threshold = adaptive_threshold( - self.imagename, + self.image_path = build_file_path_in_temp_dir( + os.path.basename(self.filename), ".png" + ) + self.backend.convert(self.filename, self.image_path) + + self.pdf_image, self.threshold = adaptive_threshold( + self.image_path, process_background=self.process_background, blocksize=self.threshold_blocksize, c=self.threshold_constant, ) - image_width = self.image.shape[1] - image_height = self.image.shape[0] + image_width = self.pdf_image.shape[1] + image_height = self.pdf_image.shape[0] image_width_scaler = image_width / float(self.pdf_width) image_height_scaler = image_height / float(self.pdf_height) pdf_width_scaler = self.pdf_width / float(image_width) @@ -297,46 +270,87 @@ def scale_areas(areas): areas = scale_areas(self.table_areas) table_bbox = find_joints(areas, vertical_mask, horizontal_mask) - self.table_bbox_unscaled = copy.deepcopy(table_bbox) - - self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image( + [ + self.table_bbox_parses, + self.vertical_segments, + self.horizontal_segments + ] = scale_image( table_bbox, vertical_segments, horizontal_segments, pdf_scalers ) - def _generate_columns_and_rows(self, table_idx, tk): + for bbox, parse in self.table_bbox_parses.items(): + joints = parse["joints"] + + # Merge x coordinates that are close together + line_tol = self.line_tol + # Sort the joints, make them a list of lists (instead of sets) + joints_normalized = list( + map( + lambda x: list(x), + sorted(joints, key=lambda j: - j[0]) + ) + ) + for idx in range(1, len(joints_normalized)): + x_left, x_right = \ + joints_normalized[idx - 1][0], joints_normalized[idx][0] + if x_left - line_tol <= x_right <= x_left + line_tol: + joints_normalized[idx][0] = x_left + + # Merge y coordinates that are close together + joints_normalized = sorted(joints_normalized, key=lambda j: -j[1]) + for idx in range(1, len(joints_normalized)): + y_bottom, y_top = \ + joints_normalized[idx - 1][1], joints_normalized[idx][1] + if y_bottom - line_tol <= y_top <= y_bottom + line_tol: + joints_normalized[idx][1] = y_bottom + + # TODO: check this is useful, otherwise get rid of the code + # above + parse["joints_normalized"] = joints_normalized + + cols = list(map(lambda coords: coords[0], joints)) + cols.extend([bbox[0], bbox[2]]) + rows = list(map(lambda coords: coords[1], joints)) + rows.extend([bbox[1], bbox[3]]) + + # sort horizontal and vertical segments + cols = 
merge_close_lines(sorted(cols), line_tol=self.line_tol) + rows = merge_close_lines( + sorted(rows, reverse=True), + line_tol=self.line_tol + ) + parse["col_anchors"] = cols + parse["row_anchors"] = rows + + def _generate_columns_and_rows(self, bbox, user_cols): # select elements which lie within table_bbox - t_bbox = {} v_s, h_s = segments_in_bbox( - tk, self.vertical_segments, self.horizontal_segments + bbox, self.vertical_segments, self.horizontal_segments ) - t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) - t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text) - - t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) - t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) - - self.t_bbox = t_bbox - - cols, rows = zip(*self.table_bbox[tk]) - cols, rows = list(cols), list(rows) - cols.extend([tk[0], tk[2]]) - rows.extend([tk[1], tk[3]]) - # sort horizontal and vertical segments - cols = merge_close_lines(sorted(cols), line_tol=self.line_tol) - rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol) - # make grid using x and y coord of shortlisted rows and cols - cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] - rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] - + self.t_bbox = text_in_bbox_per_axis( + bbox, + self.horizontal_text, + self.vertical_text + ) + parse = self.table_bbox_parses[bbox] + + cols = [ + (parse["col_anchors"][i], parse["col_anchors"][i + 1]) + for i in range(0, len(parse["col_anchors"]) - 1) + ] + rows = [ + (parse["row_anchors"][i], parse["row_anchors"][i + 1]) + for i in range(0, len(parse["row_anchors"]) - 1) + ] return cols, rows, v_s, h_s - def _generate_table(self, table_idx, cols, rows, **kwargs): + def _generate_table(self, table_idx, bbox, cols, rows, **kwargs): v_s = kwargs.get("v_s") h_s = kwargs.get("h_s") if v_s is None or h_s is None: raise ValueError(f"No segments found on {self.rootname}") - table = Table(cols, rows) + table = self._initialize_new_table(table_idx, bbox, cols, rows) # set table edges to True using ver+hor lines table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol) # set table border edges to True @@ -344,80 +358,5 @@ def _generate_table(self, table_idx, cols, rows, **kwargs): # set spanning cells to True table = table.set_span() - pos_errors = [] - # TODO: have a single list in place of two directional ones? - # sorted on x-coordinate based on reading order i.e. 
LTR or RTL - for direction in ["vertical", "horizontal"]: - for t in self.t_bbox[direction]: - indices, error = get_table_index( - table, - t, - direction, - split_text=self.split_text, - flag_size=self.flag_size, - strip_text=self.strip_text, - ) - if indices[0][:2] != (-1, -1): - pos_errors.append(error) - indices = Lattice._reduce_index( - table, indices, shift_text=self.shift_text - ) - for r_idx, c_idx, text in indices: - table.cells[r_idx][c_idx].text = text - accuracy = compute_accuracy([[100, pos_errors]]) - - if self.copy_text is not None: - table = Lattice._copy_spanning_text(table, copy_text=self.copy_text) - - data = table.data - table.df = pd.DataFrame(data) - table.shape = table.df.shape - - whitespace = compute_whitespace(data) - table.flavor = "lattice" - table.accuracy = accuracy - table.whitespace = whitespace - table.order = table_idx + 1 - table.page = int(os.path.basename(self.rootname).replace("page-", "")) - - # for plotting - _text = [] - _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) - _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) - table._text = _text - table._image = (self.image, self.table_bbox_unscaled) - table._segments = (self.vertical_segments, self.horizontal_segments) - table._textedges = None - + self.record_parse_metadata(table) return table - - def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): - self._generate_layout(filename, layout_kwargs) - if not suppress_stdout: - logger.info(f"Processing {os.path.basename(self.rootname)}") - - if not self.horizontal_text: - if self.images: - warnings.warn( - "{} is image-based, camelot only works on" - " text-based pages.".format(os.path.basename(self.rootname)) - ) - else: - warnings.warn(f"No tables found on {os.path.basename(self.rootname)}") - return [] - - self.backend.convert(self.filename, self.imagename) - - self._generate_table_bbox() - - _tables = [] - # sort tables based on y-coord - for table_idx, tk in enumerate( - sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) - ): - cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk) - table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s) - table._bbox = tk - _tables.append(table) - - return _tables diff --git a/camelot/parsers/network.py b/camelot/parsers/network.py new file mode 100644 index 0000000..e9d43be --- /dev/null +++ b/camelot/parsers/network.py @@ -0,0 +1,701 @@ +"""Implementation of network table parser.""" + + +import copy +import math + +import numpy as np + +from ..core import ALL_ALIGNMENTS +from ..core import HORIZONTAL_ALIGNMENTS +from ..core import VERTICAL_ALIGNMENTS +from ..core import TextAlignments +from ..utils import bbox_from_str +from ..utils import bbox_from_textlines +from ..utils import boundaries_to_split_lines +from ..utils import find_columns_boundaries +from ..utils import text_in_bbox +from ..utils import text_in_bbox_per_axis +from ..utils import textlines_overlapping_bbox +from .base import TextBaseParser + + +# maximum number of columns over which a header can spread +MAX_COL_SPREAD_IN_HEADER = 3 + +# Minimum number of textlines in a table +MINIMUM_TEXTLINES_IN_TABLE = 6 + + +def column_spread(left, right, col_anchors): + """Get the number of columns crossed by a segment [left, right].""" + index_left = 0 + while index_left < len(col_anchors) and col_anchors[index_left] < left: + index_left += 1 + index_right = index_left + while index_right < len(col_anchors) and col_anchors[index_right] < right: + 
        index_right += 1
+
+    return index_right - index_left
+
+
+def find_closest_tls(bbox, tls):
+    """Search for the closest textlines outside the bbox in each of the
+    four directions.
+    """
+    left, right, top, bottom = None, None, None, None
+    (bbox_left, bbox_bottom, bbox_right, bbox_top) = bbox
+    for textline in tls:
+        if textline.x1 < bbox_left:
+            # Left: only consider it if it overlaps the bbox vertically
+            if textline.y0 > bbox_top or textline.y1 < bbox_bottom:
+                continue
+            if left is None or left.x1 < textline.x1:
+                left = textline
+        elif bbox_right < textline.x0:
+            # Right: only consider it if it overlaps the bbox vertically
+            if textline.y0 > bbox_top or textline.y1 < bbox_bottom:
+                continue
+            if right is None or right.x0 > textline.x0:
+                right = textline
+        else:
+            # Either bottom or top: it must overlap the bbox horizontally
+            if textline.x0 > bbox_right or textline.x1 < bbox_left:
+                continue
+            if textline.y1 < bbox_bottom:
+                # Bottom
+                if bottom is None or bottom.y1 < textline.y1:
+                    bottom = textline
+            elif bbox_top < textline.y0:
+                # Top
+                if top is None or top.y0 > textline.y0:
+                    top = textline
+    return {
+        "left": left,
+        "right": right,
+        "top": top,
+        "bottom": bottom,
+    }
+
+
+def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
+    """Expand a bbox vertically upwards by looking for plausible headers.
+
+    The core algorithm is based on fairly strict alignment of text. It works
+    for the table body, but can fail on table headers since they tend to be
+    in a different font, alignment (e.g. vertical), etc.
+    This method evaluates the area above the table body's bbox for
+    characteristics of a table header: close to the top of the body, with
+    cells that fit within the horizontal bounds identified.
+    """
+    new_bbox = body_bbox
+    (left, bottom, right, top) = body_bbox
+    zones = []
+
+    keep_searching = True
+    while keep_searching:
+        keep_searching = False
+        # a/ First look for the closest text element above the bbox.
+        # It will be the anchor for a possible new row.
+        closest_above = None
+        all_above = []
+        for textline in textlines:
+            # higher than the table, >50% within its bounds
+            textline_center = 0.5 * (textline.x0 + textline.x1)
+            if textline.y0 > top and left < textline_center < right:
+                all_above.append(textline)
+                if closest_above is None or closest_above.y0 > textline.y0:
+                    closest_above = textline
+
+        if closest_above and closest_above.y0 < top + max_v_gap:
+            # b/ We have a candidate cell that is within the correct
+            # vertical band, and directly above the table. Starting from
+            # this anchor, we list all the textlines within the same row.
+            tls_in_new_row = []
+            top = closest_above.y1
+            pushed_up = True
+            while pushed_up:
+                pushed_up = False
+                # Iterate and extract elements that fit in the row
+                # from our list
+                for i in range(len(all_above) - 1, -1, -1):
+                    textline = all_above[i]
+                    if textline.y0 < top:
+                        # The bottom of this element is within our row,
+                        # so we add it.
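+                        # (If its top extends above the current band, the
+                        # check below raises the band and triggers another
+                        # scan for newly overlapping textlines.)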
+ tls_in_new_row.append(textline) + all_above.pop(i) + if textline.y1 > top: + # If the top of this element raises our row's + # band, we'll need to keep on searching for + # overlapping items + top = textline.y1 + pushed_up = True + + # Get the x-ranges for all the textlines, and merge the + # x-ranges that overlap + zones = zones + list( + map(lambda textline: [textline.x0, textline.x1], tls_in_new_row) + ) + zones.sort(key=lambda z: z[0]) # Sort by left coordinate + # Starting from the right, if two zones overlap horizontally, + # merge them + merged_something = True + while merged_something: + merged_something = False + for i in range(len(zones) - 1, 0, -1): + zone_right = zones[i] + zone_left = zones[i - 1] + if zone_left[1] >= zone_right[0]: + zone_left[1] = max(zone_right[1], zone_left[1]) + zones.pop(i) + merged_something = True + + max_spread = max( + list( + map( + lambda zone: column_spread(zone[0], zone[1], col_anchors), zones + ) + ) + ) + + # Accept textlines that cross columns boundaries, as long as they + # cross less than MAX_COL_SPREAD_IN_HEADER, and half the number of + # columns. + # This is to avoid picking unrelated paragraphs. + if max_spread <= min( + MAX_COL_SPREAD_IN_HEADER, math.ceil(len(col_anchors) / 2) + ): + # Combined, the elements we've identified don't cross more + # than the authorized number of columns. + # We're trying to avoid + # 0: + # 1: + # 2: + # if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS: + new_bbox = (left, bottom, right, top) + + # At this stage we've identified a plausible row (or the + # beginning of one). + keep_searching = True + return new_bbox + + +class AlignmentCounter: + """ + For a given textline, represent all other textlines aligned with it. + + A textline can be vertically aligned with others if their bbox match on + left, right, or middle coord, and horizontally aligned if they match top, + bottom, or center coord. + + """ + + def __init__(self): + self.alignment_to_occurrences = {} + for alignment in ALL_ALIGNMENTS: + self.alignment_to_occurrences[alignment] = [] + + def __getitem__(self, key): + return self.alignment_to_occurrences[key] + + def __setitem__(self, key, value): + self.alignment_to_occurrences[key] = value + return value + + def max_alignments(self, alignment_ids=None): + """Get the alignment dimension with the max number of textlines.""" + alignment_ids = alignment_ids or self.alignment_to_occurrences.keys() + alignment_items = map( + lambda alignment_id: ( + alignment_id, + self.alignment_to_occurrences[alignment_id], + ), + alignment_ids, + ) + return max(alignment_items, key=lambda item: len(item[1])) + + def max_v(self): + """Tuple (alignment_id, textlines) of largest vertical row.""" + # Note that the horizontal alignments (left, center, right) are aligned + # vertically in a column, so max_v is calculated by looking at + # horizontal alignments. + return self.max_alignments(HORIZONTAL_ALIGNMENTS) + + def max_h(self): + """Tuple (alignment_id, textlines) of largest horizontal col.""" + return self.max_alignments(VERTICAL_ALIGNMENTS) + + def max_v_count(self): + """Returns the maximum number of alignments along + one of the vertical axis (left/right/middle). + """ + return len(self.max_v()[1]) + + def max_h_count(self): + """Returns the maximum number of alignments along + one of the horizontal axis (bottom/top/center). + """ + return len(self.max_h()[1]) + + def alignment_score(self): + """We define the alignment score of a textline as the product of the + number of aligned elements - 1. 
The -1 avoids favoring singletons on a long
+        line. For illustration, a textline whose largest vertical group
+        has four members and largest horizontal group has three scores
+        (4 - 1) * (3 - 1) = 6.
+        """
+        return (self.max_v_count() - 1) * (self.max_h_count() - 1)
+
+
+class TextNetworks(TextAlignments):
+    """Text elements connected by vertical AND horizontal alignments.
+
+    The alignment dict has six keys based on the hor/vert alignments,
+    and each key's value is a list of camelot.core.TextAlignment objects.
+    """
+
+    def __init__(self):
+        super().__init__(ALL_ALIGNMENTS)
+        # Maps each textline to an AlignmentCounter recording, per
+        # alignment type, the textlines aligned with it
+        self._textline_to_alignments = {}
+
+    def _update_alignment(self, alignment, coord, textline):
+        alignment.register_aligned_textline(textline, coord)
+
+    def _register_all_text_lines(self, textlines):
+        """Add all textlines to our network repository to
+        identify alignments.
+        """
+        # Identify all the alignments
+        for textline in textlines:
+            if len(textline.get_text().strip()) > 0:
+                self._register_textline(textline)
+
+    def _compute_alignment_counts(self):
+        """Build a dictionary textline -> alignment object."""
+        for align_id, textedges in self._text_alignments.items():
+            for textedge in textedges:
+                for textline in textedge.textlines:
+                    alignments = self._textline_to_alignments.get(textline, None)
+                    if alignments is None:
+                        alignments = AlignmentCounter()
+                        self._textline_to_alignments[textline] = alignments
+                    alignments[align_id] = textedge.textlines
+
+    def remove_unconnected_edges(self):
+        """Weed out elements that are only connected to others vertically
+        or horizontally. There must be connections across both
+        dimensions.
+        """
+        removed_singletons = True
+        while removed_singletons:
+            removed_singletons = False
+            for text_alignments in self._text_alignments.values():
+                # For each alignment edge, remove items if they are singletons
+                # either horizontally or vertically
+                for text_alignment in text_alignments:
+                    for i in range(len(text_alignment.textlines) - 1, -1, -1):
+                        textline = text_alignment.textlines[i]
+                        alignments = self._textline_to_alignments[textline]
+                        if (
+                            alignments.max_h_count() <= 1
+                            or alignments.max_v_count() <= 1
+                        ):
+                            del text_alignment.textlines[i]
+                            removed_singletons = True
+            self._textline_to_alignments = {}
+            self._compute_alignment_counts()
+
+    def most_connected_textline(self):
+        """Retrieve the textline that is most connected across the vertical
+        and horizontal axes.
+        """
+        # Find the textline with the highest alignment score, with a tie break
+        # to prefer textlines further down in the table. Starting the search
+        # from the table's bottom allows the algorithm to collect data on more
+        # cells before going to the header, typically harder to parse.
+        return max(
+            self._textline_to_alignments.keys(),
+            key=lambda textline: (
+                self._textline_to_alignments[textline].alignment_score(),
+                -textline.y0,
+                -textline.x0,
+            ),
+            default=None,
+        )
+
+    def compute_plausible_gaps(self):
+        """Evaluate plausible gaps between cells horizontally and vertically
+        based on the textlines aligned with the most connected textline.
+
+        Returns
+        -------
+        gaps_hv : tuple
+            (horizontal_gap, vertical_gap) in pdf coordinate space.
+
+        """
+        # Determine the textline that has the most combined
+        # alignments across the horizontal and vertical axes.
+        # It will serve as a reference axis along which to collect the average
+        # spacing between rows/cols.
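+        # (In a dense grid this is typically a body cell that shares a
+        # full column of alignments above/below it and a full row to its
+        # left/right.)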
+ most_aligned_tl = self.most_connected_textline() + if most_aligned_tl is None: + return None + + # Retrieve the list of textlines it's aligned with, across both + # axis + best_alignment = self._textline_to_alignments[most_aligned_tl] + __, ref_h_textlines = best_alignment.max_h() + __, ref_v_textlines = best_alignment.max_v() + if len(ref_v_textlines) <= 1 or len(ref_h_textlines) <= 1: + return None + + h_textlines = sorted( + ref_h_textlines, key=lambda textline: textline.x0, reverse=True + ) + v_textlines = sorted( + ref_v_textlines, key=lambda textline: textline.y0, reverse=True + ) + + h_gaps, v_gaps = [], [] + for i in range(1, len(v_textlines)): + v_gaps.append(v_textlines[i - 1].y0 - v_textlines[i].y0) + for i in range(1, len(h_textlines)): + h_gaps.append(h_textlines[i - 1].x0 - h_textlines[i].x0) + + if not h_gaps or not v_gaps: + return None + percentile = 75 + gaps_hv = ( + 2.0 * np.percentile(h_gaps, percentile), + 2.0 * np.percentile(v_gaps, percentile), + ) + return gaps_hv + + def search_table_body(self, gaps_hv, parse_details=None): + """Build a candidate bbox for the body of a table using network algo + + Seed the process with the textline with the highest alignment + score, then expand the bbox with textlines within threshold. + + Parameters + ---------- + gaps_hv : tuple + The maximum distance allowed to consider surrounding lines/columns + as part of the same table. + parse_details : array (optional) + Optional parameter array, in which to store extra information + to help later visualization of the table creation. + """ + # First, determine the textline that has the most combined + # alignments across horizontal and vertical axis. + # It will serve both as a starting point for the table boundary + # search, and as a way to estimate the average spacing between + # rows/cols. + most_aligned_tl = self.most_connected_textline() + + # Calculate the 75th percentile of the horizontal/vertical + # gaps between textlines. Use this as a reference for a threshold + # to not exceed while looking for table boundaries. + max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1] + + if parse_details is not None: + # Store debug info + parse_details_search = { + "max_h_gap": max_h_gap, + "max_v_gap": max_v_gap, + "iterations": [], + } + parse_details.append(parse_details_search) + else: + parse_details_search = None + + bbox = [ + most_aligned_tl.x0, + most_aligned_tl.y0, + most_aligned_tl.x1, + most_aligned_tl.y1, + ] + + # For the body of the table, we only consider cells that have + # alignments on both axis. 
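+        # (Textlines that never registered an alignment do not appear in
+        # _textline_to_alignments, so they are excluded by construction.)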
+ tls_search_space = list(self._textline_to_alignments.keys()) + # tls_search_space = [] + tls_search_space.remove(most_aligned_tl) + tls_in_bbox = [most_aligned_tl] + last_bbox = None + last_cols_bounds = [(most_aligned_tl.x0, most_aligned_tl.x1)] + while last_bbox != bbox: + if parse_details_search is not None: + # Store debug info + parse_details_search["iterations"].append(bbox) + + # Check that the closest tls are within the gaps allowed + last_bbox = bbox + cand_bbox = last_bbox.copy() + closest_tls = find_closest_tls(bbox, tls_search_space) + for direction, textline in closest_tls.items(): + if textline is None: + continue + expanded_cand_bbox = cand_bbox.copy() + + if direction == "left": + if expanded_cand_bbox[0] - textline.x1 > gaps_hv[0]: + continue + expanded_cand_bbox[0] = textline.x0 + elif direction == "right": + if textline.x0 - expanded_cand_bbox[2] > gaps_hv[0]: + continue + expanded_cand_bbox[2] = textline.x1 + elif direction == "bottom": + if expanded_cand_bbox[1] - textline.y1 > gaps_hv[1]: + continue + expanded_cand_bbox[1] = textline.y0 + elif direction == "top": + if textline.y0 - expanded_cand_bbox[3] > gaps_hv[1]: + continue + expanded_cand_bbox[3] = textline.y1 + + # If they are, see what an expanded bbox in that direction + # would contain + new_tls = text_in_bbox(expanded_cand_bbox, tls_search_space) + tls_in_new_box = new_tls + tls_in_bbox + + # And if we're expanding up or down, check that the addition + # of the new row won't reduce the number of columns. + # This happens when text covers multiple rows - that's only + # allowed in the header, treated separately. + cols_bounds = find_columns_boundaries(tls_in_new_box) + if direction in ["bottom", "top"] and len(cols_bounds) < len( + last_cols_bounds + ): + continue + + # We have an expansion candidate: register it, update the + # search space and repeat + # We use bbox_from_textlines instead of cand_bbox in case some + # overlapping textlines require a large bbox for strict fit. + bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box)) + last_cols_bounds = cols_bounds + tls_in_bbox.extend(new_tls) + for i in range(len(tls_search_space) - 1, -1, -1): + textline = tls_search_space[i] + if textline in new_tls: + del tls_search_space[i] + + if len(tls_in_bbox) >= MINIMUM_TEXTLINES_IN_TABLE: + return bbox + return None + + def generate(self, textlines): + """Generate the text edge dictionaries based on the + input textlines. + """ + self._register_all_text_lines(textlines) + self._compute_alignment_counts() + + +class Network(TextBaseParser): + """Network method of parsing looks for spaces between text + to parse the table. + + If you want to specify columns when specifying multiple table + areas, make sure that the length of both lists are equal. + + Parameters + ---------- + table_regions : list, optional (default: None) + List of page regions that may contain tables of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. + table_areas : list, optional (default: None) + List of table area strings of the form x1,y1,x2,y2 + where (x1, y1) -> left-top and (x2, y2) -> right-bottom + in PDF coordinate space. + columns : list, optional (default: None) + List of column x-coordinates strings where the coordinates + are comma-separated. + split_text : bool, optional (default: False) + Split text that spans across multiple cells. + flag_size : bool, optional (default: False) + Flag text based on font size. Useful to detect + super/subscripts. 
Adds around flagged text. + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. + edge_tol : int, optional (default: 50) + Tolerance parameter for extending textedges vertically. + row_tol : int, optional (default: 2) + Tolerance parameter used to combine text vertically, + to generate rows. + column_tol : int, optional (default: 0) + Tolerance parameter used to combine text horizontally, + to generate columns. + + """ + + def __init__( + self, + table_regions=None, + table_areas=None, + columns=None, + flag_size=False, + split_text=False, + strip_text="", + edge_tol=None, + row_tol=2, + column_tol=0, + debug=False, + **kwargs + ): + super().__init__( + "network", + table_regions=table_regions, + table_areas=table_areas, + columns=columns, + flag_size=flag_size, + split_text=split_text, + strip_text=strip_text, + edge_tol=edge_tol, + row_tol=row_tol, + column_tol=column_tol, + debug=debug, + ) + + def _generate_table_bbox(self): + user_provided_bboxes = None + if self.table_areas is not None: + # User gave us table areas already. We will use their coordinates + # to find column anchors. + user_provided_bboxes = [] + for area_str in self.table_areas: + user_provided_bboxes.append(bbox_from_str(area_str)) + + # Take all the textlines that are not just spaces + all_textlines = [ + t + for t in self.horizontal_text + self.vertical_text + if len(t.get_text().strip()) > 0 + ] + textlines = self._apply_regions_filter(all_textlines) + + textlines_processed = {} + self.table_bbox_parses = {} + if self.parse_details is not None: + parse_details_network_searches = [] + self.parse_details["network_searches"] = parse_details_network_searches + parse_details_bbox_searches = [] + self.parse_details["bbox_searches"] = parse_details_bbox_searches + self.parse_details["col_searches"] = [] + else: + parse_details_network_searches = None + parse_details_bbox_searches = None + + while True: + # Find a bbox: either pulling from the user's or from the network + # algorithm. + + # First look for the body of the table + bbox_body = None + if user_provided_bboxes is not None: + if len(user_provided_bboxes) > 0: + bbox_body = user_provided_bboxes.pop() + else: + text_network = TextNetworks() + text_network.generate(textlines) + text_network.remove_unconnected_edges() + gaps_hv = text_network.compute_plausible_gaps() + if gaps_hv is None: + break + # return None + # edge_tol instructions override the calculated vertical gap + edge_tol_hv = ( + gaps_hv[0], + gaps_hv[1] if self.edge_tol is None else self.edge_tol, + ) + bbox_body = text_network.search_table_body( + edge_tol_hv, parse_details=parse_details_bbox_searches + ) + + if parse_details_network_searches is not None: + # Preserve the current edge calculation for debugging + parse_details_network_searches.append(copy.deepcopy(text_network)) + + if bbox_body is None: + break + + # Get all the textlines that overlap with the box, compute + # columns + tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines) + cols_boundaries = find_columns_boundaries(tls_in_bbox) + cols_anchors = boundaries_to_split_lines(cols_boundaries) + + # Unless the user gave us strict bbox_body, try to find a header + # above the body to build the full bbox. 
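+            # (search_header_from_body_bbox walks upward one row of text
+            # at a time, keeping rows whose cells do not spread across
+            # too many of the detected columns.)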
+ if user_provided_bboxes is not None: + bbox_full = bbox_body + else: + # Expand the text box to fully contain the tls we found + bbox_body = bbox_from_textlines(tls_in_bbox) + + # Apply a heuristic to salvage headers which formatting might + # be off compared to the rest of the table. + bbox_full = search_header_from_body_bbox( + bbox_body, textlines, cols_anchors, gaps_hv[1] + ) + + table_parse = { + "bbox_body": bbox_body, + "cols_boundaries": cols_boundaries, + "cols_anchors": cols_anchors, + "bbox_full": bbox_full, + } + self.table_bbox_parses[bbox_full] = table_parse + + if self.parse_details is not None: + self.parse_details["col_searches"].append(table_parse) + + # Remember what textlines we processed, and repeat + for textline in tls_in_bbox: + textlines_processed[textline] = None + textlines = list( + filter(lambda textline: textline not in textlines_processed, textlines) + ) + + def _generate_columns_and_rows(self, bbox, user_cols): + # select elements which lie within table_bbox + self.t_bbox = text_in_bbox_per_axis( + bbox, self.horizontal_text, self.vertical_text + ) + + all_tls = list( + sorted( + filter( + lambda textline: len(textline.get_text().strip()) > 0, + self.t_bbox["horizontal"] + self.t_bbox["vertical"], + ), + key=lambda textline: (-textline.y0, textline.x0), + ) + ) + text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(all_tls) + # FRHTODO: + # This algorithm takes the horizontal textlines in the bbox, and groups + # them into rows based on their bottom y0. + # That's wrong: it misses the vertical items, and misses out on all + # the alignment identification work we've done earlier. + rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol) + rows = self._join_rows(rows_grouped, text_y_max, text_y_min) + + if user_cols is not None: + cols = [text_x_min] + user_cols + [text_x_max] + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + else: + parse_details = self.table_bbox_parses[bbox] + col_anchors = parse_details["cols_anchors"] + cols = list( + map( + lambda idx: [col_anchors[idx], col_anchors[idx + 1]], + range(0, len(col_anchors) - 1), + ) + ) + + return cols, rows, None, None diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 266a0e9..449a136 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -1,23 +1,18 @@ -import logging -import os import warnings -import numpy as np -import pandas as pd - from ..core import Table from ..core import TextEdges from ..utils import compute_accuracy from ..utils import compute_whitespace from ..utils import get_table_index from ..utils import text_in_bbox -from .base import BaseParser - - -logger = logging.getLogger("camelot") +from ..utils import bbox_from_str +from ..utils import bbox_from_textlines +from ..utils import text_in_bbox_per_axis +from .base import TextBaseParser -class Stream(BaseParser): +class Stream(TextBaseParser): """Stream method of parsing looks for spaces between text to parse the table. @@ -69,209 +64,20 @@ def __init__( column_tol=0, **kwargs, ): - self.table_regions = table_regions - self.table_areas = table_areas - self.columns = columns - self._validate_columns() - self.split_text = split_text - self.flag_size = flag_size - self.strip_text = strip_text - self.edge_tol = edge_tol - self.row_tol = row_tol - self.column_tol = column_tol - - @staticmethod - def _text_bbox(t_bbox): - """Returns bounding box for the text present on a page. 
- - Parameters - ---------- - t_bbox : dict - Dict with two keys 'horizontal' and 'vertical' with lists of - LTTextLineHorizontals and LTTextLineVerticals respectively. - - Returns - ------- - text_bbox : tuple - Tuple (x0, y0, x1, y1) in pdf coordinate space. - - """ - xmin = 0 - ymin = 0 - xmax = 0 - ymax = 0 - if len([t.x0 for direction in t_bbox for t in t_bbox[direction]]) > 0: - xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]]) - ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]]) - xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]]) - ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]]) - text_bbox = (xmin, ymin, xmax, ymax) - return text_bbox - - @staticmethod - def _group_rows(text, row_tol=2): - """Groups PDFMiner text objects into rows vertically - within a tolerance. - - Parameters - ---------- - text : list - List of PDFMiner text objects. - row_tol : int, optional (default: 2) - - Returns - ------- - rows : list - Two-dimensional list of text objects grouped into rows. - - """ - row_y = 0 - rows = [] - temp = [] - - for t in text: - # is checking for upright necessary? - # if t.get_text().strip() and all([obj.upright for obj in t._objs if - # type(obj) is LTChar]): - if t.get_text().strip(): - if not np.isclose(row_y, t.y0, atol=row_tol): - rows.append(sorted(temp, key=lambda t: t.x0)) - temp = [] - row_y = t.y0 - temp.append(t) - - rows.append(sorted(temp, key=lambda t: t.x0)) - if len(rows) > 1: - __ = rows.pop(0) # TODO: hacky - return rows - - @staticmethod - def _merge_columns(l, column_tol=0): - """Merges column boundaries horizontally if they overlap - or lie within a tolerance. - - Parameters - ---------- - l : list - List of column x-coordinate tuples. - column_tol : int, optional (default: 0) - - Returns - ------- - merged : list - List of merged column x-coordinate tuples. - - """ - merged = [] - for higher in l: - if not merged: - merged.append(higher) - else: - lower = merged[-1] - if column_tol >= 0: - if higher[0] <= lower[1] or np.isclose( - higher[0], lower[1], atol=column_tol - ): - upper_bound = max(lower[1], higher[1]) - lower_bound = min(lower[0], higher[0]) - merged[-1] = (lower_bound, upper_bound) - else: - merged.append(higher) - elif column_tol < 0: - if higher[0] <= lower[1]: - if np.isclose(higher[0], lower[1], atol=abs(column_tol)): - merged.append(higher) - else: - upper_bound = max(lower[1], higher[1]) - lower_bound = min(lower[0], higher[0]) - merged[-1] = (lower_bound, upper_bound) - else: - merged.append(higher) - return merged - - @staticmethod - def _join_rows(rows_grouped, text_y_max, text_y_min): - """Makes row coordinates continuous. - - Parameters - ---------- - rows_grouped : list - Two-dimensional list of text objects grouped into rows. - text_y_max : int - text_y_min : int - - Returns - ------- - rows : list - List of continuous row y-coordinate tuples. - - """ - row_mids = [ - sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0 - for r in rows_grouped - ] - rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] - rows.insert(0, text_y_max) - rows.append(text_y_min) - rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] - return rows - - @staticmethod - def _add_columns(cols, text, row_tol): - """Adds columns to existing list by taking into account - the text that lies outside the current column x-coordinates. - - Parameters - ---------- - cols : list - List of column x-coordinate tuples. 
- text : list - List of PDFMiner text objects. - ytol : int - - Returns - ------- - cols : list - Updated list of column x-coordinate tuples. - - """ - if text: - text = Stream._group_rows(text, row_tol=row_tol) - elements = [len(r) for r in text] - new_cols = [ - (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r - ] - cols.extend(Stream._merge_columns(sorted(new_cols))) - return cols - - @staticmethod - def _join_columns(cols, text_x_min, text_x_max): - """Makes column coordinates continuous. - - Parameters - ---------- - cols : list - List of column x-coordinate tuples. - text_x_min : int - text_y_max : int - - Returns - ------- - cols : list - Updated list of column x-coordinate tuples. - - """ - cols = sorted(cols) - cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] - cols.insert(0, text_x_min) - cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] - return cols - - def _validate_columns(self): - if self.table_areas is not None and self.columns is not None: - if len(self.table_areas) != len(self.columns): - raise ValueError("Length of table_areas and columns" " should be equal") + super().__init__( + "stream", + table_regions=table_regions, + table_areas=table_areas, + columns=columns, + # _validate_columns() + split_text=split_text, + flag_size=flag_size, + strip_text=strip_text, + edge_tol=edge_tol, + row_tol=row_tol, + column_tol=column_tol, + ) + self.textedges = [] def _nurminen_table_detection(self, textlines): """A general implementation of the table detection algorithm @@ -293,65 +99,57 @@ def _nurminen_table_detection(self, textlines): # guess table areas using textlines and relevant edges table_bbox = textedges.get_table_areas(textlines, relevant_textedges) # treat whole page as table area if no table areas found - if not len(table_bbox): + if not table_bbox: table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} return table_bbox + def record_parse_metadata(self, table): + """Record data about the origin of the table + """ + super().record_parse_metadata(table) + table._textedges = self.textedges + def _generate_table_bbox(self): - self.textedges = [] if self.table_areas is None: hor_text = self.horizontal_text if self.table_regions is not None: # filter horizontal text hor_text = [] - for region in self.table_regions: - x1, y1, x2, y2 = region.split(",") - x1 = float(x1) - y1 = float(y1) - x2 = float(x2) - y2 = float(y2) - region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text) - hor_text.extend(region_text) + for region_str in self.table_regions: + region_text = text_in_bbox(bbox_from_str(region_str), self.horizontal_text) + hor_text.extend(region_text) # find tables based on nurminen's detection algorithm - table_bbox = self._nurminen_table_detection(hor_text) + table_bbox_parses = self._nurminen_table_detection(hor_text) else: - table_bbox = {} - for area in self.table_areas: - x1, y1, x2, y2 = area.split(",") - x1 = float(x1) - y1 = float(y1) - x2 = float(x2) - y2 = float(y2) - table_bbox[(x1, y2, x2, y1)] = None - self.table_bbox = table_bbox + table_bbox_parses = {} + for area_str in self.table_areas: + table_bbox_parses[bbox_from_str(area_str)] = None + self.table_bbox_parses = table_bbox_parses - def _generate_columns_and_rows(self, table_idx, tk): + def _generate_columns_and_rows(self, bbox, user_cols): # select elements which lie within table_bbox - t_bbox = {} - t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text) - t_bbox["vertical"] = text_in_bbox(tk, 
self.vertical_text) - - t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) - t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) - - self.t_bbox = t_bbox - - text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) - rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol) + self.t_bbox = text_in_bbox_per_axis( + bbox, + self.horizontal_text, + self.vertical_text + ) + + text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines( + self.t_bbox["horizontal"] + self.t_bbox["vertical"] + ) + + rows_grouped = self._group_rows( + self.t_bbox["horizontal"], row_tol=self.row_tol) rows = self._join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in rows_grouped] - if self.columns is not None and self.columns[table_idx] != "": - # user has to input boundary columns too - # take (0, pdf_width) by default - # similar to else condition - # len can't be 1 - cols = self.columns[table_idx].split(",") - cols = [float(c) for c in cols] - cols.insert(0, text_x_min) - cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + if user_cols is not None: + cols = [text_x_min] + user_cols + [text_x_max] + cols = [ + (cols[i], cols[i + 1]) + for i in range(0, len(cols) - 1) + ] else: # calculate mode of the list of number of elements in # each row to guess the number of columns @@ -366,10 +164,10 @@ def _generate_columns_and_rows(self, table_idx, tk): # see if the list contains elements, if yes, then use # the mode after removing 1s elements = list(filter(lambda x: x != 1, elements)) - if len(elements): + if elements: ncols = max(set(elements), key=elements.count) else: - warnings.warn(f"No tables found in table area {table_idx + 1}") + warnings.warn(f"No tables found in table area {bbox}") cols = [ (t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r ] @@ -386,6 +184,7 @@ def _generate_columns_and_rows(self, table_idx, tk): if t.x0 > left and t.x1 < right ] ) + outer_text = [ t for direction in self.t_bbox @@ -395,81 +194,4 @@ def _generate_columns_and_rows(self, table_idx, tk): inner_text.extend(outer_text) cols = self._add_columns(cols, inner_text, self.row_tol) cols = self._join_columns(cols, text_x_min, text_x_max) - - return cols, rows - - def _generate_table(self, table_idx, cols, rows, **kwargs): - table = Table(cols, rows) - table = table.set_all_edges() - - pos_errors = [] - # TODO: have a single list in place of two directional ones? - # sorted on x-coordinate based on reading order i.e. 
LTR or RTL - for direction in ["vertical", "horizontal"]: - for t in self.t_bbox[direction]: - indices, error = get_table_index( - table, - t, - direction, - split_text=self.split_text, - flag_size=self.flag_size, - strip_text=self.strip_text, - ) - if indices[:2] != (-1, -1): - pos_errors.append(error) - for r_idx, c_idx, text in indices: - table.cells[r_idx][c_idx].text = text - accuracy = compute_accuracy([[100, pos_errors]]) - - data = table.data - table.df = pd.DataFrame(data) - table.shape = table.df.shape - - whitespace = compute_whitespace(data) - table.flavor = "stream" - table.accuracy = accuracy - table.whitespace = whitespace - table.order = table_idx + 1 - table.page = int(os.path.basename(self.rootname).replace("page-", "")) - - # for plotting - _text = [] - _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) - _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) - table._text = _text - table._image = None - table._segments = None - table._textedges = self.textedges - - return table - - def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): - self._generate_layout(filename, layout_kwargs) - base_filename = os.path.basename(self.rootname) - - if not suppress_stdout: - logger.info(f"Processing {base_filename}") - - if not self.horizontal_text: - if self.images: - warnings.warn( - f"{base_filename} is image-based, camelot only works on" - " text-based pages." - ) - else: - warnings.warn(f"No tables found on {base_filename}") - return [] - - self._generate_table_bbox() - - _tables = [] - # sort tables based on y-coord - for table_idx, tk in enumerate( - sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True) - ): - cols, rows = self._generate_columns_and_rows(table_idx, tk) - table = self._generate_table(table_idx, cols, rows) - table._bbox = tk - _tables.append(table) - - return _tables + return cols, rows, None, None diff --git a/camelot/plotting.py b/camelot/plotting.py index b602cef..b2c2775 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -1,3 +1,5 @@ +from pdfminer.layout import LTTextLineVertical + try: import matplotlib.patches as patches import matplotlib.pyplot as plt @@ -6,9 +8,146 @@ else: _HAS_MPL = True +from .utils import (bbox_from_str, bbox_from_textlines, get_textline_coords) + + +def extend_axe_lim(ax, bbox, margin=10): + """Ensure the ax limits include the input bbox + """ + x0, x1 = ax.get_xlim() + y0, y1 = ax.get_ylim() + ax.set_xlim(min(x0, bbox[0] - margin), max(x1, bbox[2] + margin)) + ax.set_ylim(min(y0, bbox[1] - margin), max(y1, bbox[3] + margin)) + + +def draw_labeled_bbox( + ax, bbox, text, + color="black", linewidth=3, + linestyle="solid", + label_pos="top,left", + fontsize=12, +): + """Utility drawing function to draw a box with an associated text label + """ + ax.add_patch( + patches.Rectangle( + (bbox[0], bbox[1]), + bbox[2] - bbox[0], bbox[3] - bbox[1], + color=color, + linewidth=linewidth, linestyle=linestyle, + fill=False + ) + ) + + vlabel, hlabel = label_pos.split(",") + if vlabel == "top": + y = max(bbox[1], bbox[3]) + elif vlabel == "bottom": + y = min(bbox[1], bbox[3]) + else: + y = 0.5 * (bbox[1] + bbox[3]) + + # We want to draw the label outside the box (above or below) + label_align_swap = { + "top": "bottom", + "bottom": "top", + "center": "center" + } + vlabel_out_of_box = label_align_swap[vlabel] + if hlabel == "right": + x = max(bbox[0], bbox[2]) + elif hlabel == "left": + x = min(bbox[0], bbox[2]) + else: + x = 0.5 * (bbox[0] + bbox[2]) + ax.text( + x, 
y, + text, + fontsize=fontsize, color="black", + verticalalignment=vlabel_out_of_box, + horizontalalignment=hlabel, + bbox=dict(facecolor=color, alpha=0.1) + ) + + +def draw_pdf(table, ax): + """Draw the content of the table's source pdf into the passed subplot + Parameters + ---------- + table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) + """ + img = table.get_pdf_image() + ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1])) + + +def draw_parse_constraints(table, ax): + """Draw any user provided constraints (area, region, columns, etc) + Parameters + ---------- + table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) + """ + if table.parse_details: + zone_constraints = { + "region": "table_regions", + "area": "table_areas", + } + for zone_name, zone_id in zone_constraints.items(): + # Display a bbox per region / area + for zone_str in table.parse_details[zone_id] or []: + draw_labeled_bbox( + ax, bbox_from_str(zone_str), + "{zone_name}: ({zone_str})".format( + zone_name=zone_name, + zone_str=zone_str + ), + color="purple", + linestyle="dotted", + linewidth=1, + label_pos="bottom,right" + ) + + +def draw_text(table, ax): + """Draw text, horizontal in blue, vertical in red + Parameters + ---------- + table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) + """ + bbox = bbox_from_textlines(table.textlines) + for t in table.textlines: + color = "red" if isinstance(t, LTTextLineVertical) else "blue" + ax.add_patch( + patches.Rectangle( + (t.x0, t.y0), t.x1 - t.x0, t.y1 - t.y0, color=color, alpha=0.2 + ) + ) + extend_axe_lim(ax, bbox) + + +def prepare_plot(table, ax=None): + """Initialize plot and draw common components + Parameters + ---------- + table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) + + Returns + ------- + ax : matplotlib.axes.Axes + """ + if ax is None: + fig = plt.figure() + ax = fig.add_subplot(111, aspect="equal") + draw_pdf(table, ax) + draw_parse_constraints(table, ax) + return ax + class PlotMethods: - def __call__(self, table, kind="text", filename=None): + def __call__(self, table, kind="text", filename=None, ax=None): """Plot elements found on PDF page based on kind specified, useful for debugging and playing with different parameters to get the best output. @@ -18,7 +157,8 @@ def __call__(self, table, kind="text", filename=None): table: camelot.core.Table A Camelot Table. kind : str, optional (default: 'text') - {'text', 'grid', 'contour', 'joint', 'line'} + {'text', 'grid', 'contour', 'joint', 'line', + 'network_table_search'} The element type for which a plot should be generated. filepath: str, optional (default: None) Absolute path for saving the generated plot. @@ -33,57 +173,51 @@ def __call__(self, table, kind="text", filename=None): if table.flavor == "lattice" and kind in ["textedge"]: raise NotImplementedError(f"Lattice flavor does not support kind='{kind}'") - elif table.flavor == "stream" and kind in ["joint", "line"]: - raise NotImplementedError(f"Stream flavor does not support kind='{kind}'") + if table.flavor != "lattice" and kind in ["line"]: + raise NotImplementedError(f"{table.flavor} flavor does not support kind='{kind}'") plot_method = getattr(self, kind) - fig = plot_method(table) - if filename is not None: + fig = plot_method(table, ax) fig.savefig(filename) return None - return fig + return plot_method(table, ax) - def text(self, table): + def text(self, table, ax=None): """Generates a plot for all text elements present on the PDF page. 
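+
+        Horizontal text boxes are drawn in blue and vertical ones in red
+        (see draw_text above).
+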
Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") - xs, ys = [], [] - for t in table._text: - xs.extend([t[0], t[2]]) - ys.extend([t[1], t[3]]) - ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1])) - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) - return fig - - def grid(self, table): + ax = prepare_plot(table, ax) + draw_text(table, ax) + return ax.get_figure() + + @staticmethod + def grid(table, ax=None): """Generates a plot for the detected table grids on the PDF page. Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") + ax = prepare_plot(table, ax) for row in table.cells: for cell in row: if cell.left: @@ -94,130 +228,253 @@ def grid(self, table): ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]]) if cell.bottom: ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]]) - return fig + return ax.get_figure() - def contour(self, table): + @staticmethod + def contour(table, ax=None): """Generates a plot for all table boundaries present on the PDF page. Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - try: - img, table_bbox = table._image - _FOR_LATTICE = True - except TypeError: - img, table_bbox = (None, {table._bbox: None}) - _FOR_LATTICE = False - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") - xs, ys = [], [] - if not _FOR_LATTICE: - for t in table._text: - xs.extend([t[0], t[2]]) - ys.extend([t[1], t[3]]) - ax.add_patch( - patches.Rectangle( - (t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue" - ) - ) + _FOR_LATTICE = table.flavor == "lattice" + ax = prepare_plot(table, ax) - for t in table_bbox.keys(): - ax.add_patch( - patches.Rectangle( - (t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red" - ) + if not _FOR_LATTICE: + draw_text(table, ax) + + ax.add_patch( + patches.Rectangle( + (table._bbox[0], table._bbox[1]), + table._bbox[2] - table._bbox[0], + table._bbox[3] - table._bbox[1], + fill=False, color="red" ) - if not _FOR_LATTICE: - xs.extend([t[0], t[2]]) - ys.extend([t[1], t[3]]) - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) - if _FOR_LATTICE: - ax.imshow(img) - return fig + ) - def textedge(self, table): + if not _FOR_LATTICE: + extend_axe_lim(ax, table._bbox) + return ax.get_figure() + + @staticmethod + def textedge(table, ax=None): """Generates a plot for relevant textedges. 
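+
+        For the network flavor, each textline's largest group of
+        vertically aligned neighbors is traced in green and its largest
+        horizontally aligned group in blue, each labeled with its size.
+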
Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") - xs, ys = [], [] - for t in table._text: - xs.extend([t[0], t[2]]) - ys.extend([t[1], t[3]]) - ax.add_patch( - patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue") - ) - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) + ax = prepare_plot(table, ax) + draw_text(table, ax) + + if table.flavor == "network": + for network in table.parse_details["network_searches"]: + most_connected_tl = network.most_connected_textline() - for te in table._textedges: - ax.plot([te.x, te.x], [te.y0, te.y1]) + ax.add_patch( + patches.Rectangle( + (most_connected_tl.x0, most_connected_tl.y0), + most_connected_tl.x1 - most_connected_tl.x0, + most_connected_tl.y1 - most_connected_tl.y0, + color="red", + alpha=0.5 + ) + ) + for tl in sorted( + network._textline_to_alignments.keys(), + key=lambda textline: (-textline.y0, textline.x0), + ): + alignments = network._textline_to_alignments[tl] + coords = get_textline_coords(tl) + alignment_id_h, tls_h = alignments.max_v() + alignment_id_v, tls_v = alignments.max_h() + xs = list(map(lambda tl: tl.x0, tls_v)) + ys = list(map(lambda tl: tl.y1, tls_h)) + top_h = max(ys) + ax.text( + coords[alignment_id_h], + top_h + 5, + "{max_h_count}".format(max_h_count=len(tls_h)), + verticalalignment="bottom", + horizontalalignment="center", + fontsize=8, + color="green" + ) + ax.plot( + [coords[alignment_id_h]] * len(ys), ys, + color="green", + linestyle="solid", + linewidth=1, + marker="o", + markeredgecolor="green", + fillstyle=None, + markersize=4, + alpha=0.8 + ) - return fig + left_v = min(map(lambda tl: tl.x0, tls_v)) + ax.text( + left_v - 5, + coords[alignment_id_v], + "{max_v_count}".format(max_v_count=len(tls_v)), + verticalalignment="center", + horizontalalignment="right", + fontsize=8, + color="blue" + ) + ax.plot( + xs, [coords[alignment_id_v]] * len(xs), + color="blue", + linestyle="solid", + linewidth=1, + marker="o", + markeredgecolor="blue", + fillstyle="full", + markersize=3, + alpha=0.8 + ) + else: + for te in table._textedges: + ax.plot([te.coord, te.coord], [te.y0, te.y1]) + return ax.get_figure() - def joint(self, table): + @staticmethod + def joint(table, ax=None): """Generates a plot for all line intersections present on the PDF page. Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - img, table_bbox = table._image - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") + ax = prepare_plot(table, ax) x_coord = [] y_coord = [] - for k in table_bbox.keys(): - for coord in table_bbox[k]: - x_coord.append(coord[0]) - y_coord.append(coord[1]) + for coord in table.parse["joints"]: + x_coord.append(coord[0]) + y_coord.append(coord[1]) ax.plot(x_coord, y_coord, "ro") - ax.imshow(img) - return fig + return ax.get_figure() - def line(self, table): + @staticmethod + def line(table, ax=None): """Generates a plot for all line segments present on the PDF page. 
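+
+        Only available for the lattice flavor, which is the only parser
+        that records the line segments it detects (see the flavor check
+        in __call__ above).
+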
Parameters ---------- table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) Returns ------- fig : matplotlib.fig.Figure """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect="equal") + ax = prepare_plot(table, ax) vertical, horizontal = table._segments for v in vertical: ax.plot([v[0], v[2]], [v[1], v[3]]) for h in horizontal: ax.plot([h[0], h[2]], [h[1], h[3]]) - return fig + return ax.get_figure() + + @staticmethod + def network_table_search(table, ax=None): + """Generates a plot illustrating the steps of the network table search. + Parameters + ---------- + table : camelot.core.Table + ax : matplotlib.axes.Axes (optional) + Returns + ------- + fig : matplotlib.fig.Figure + """ + ax = prepare_plot(table, ax) + if table.parse_details is None: + return ax.get_figure() + parse_details = table.parse_details + for box_id, bbox_search in enumerate(parse_details["bbox_searches"]): + max_h_gap = bbox_search["max_h_gap"] + max_v_gap = bbox_search["max_v_gap"] + iterations = bbox_search["iterations"] + for iteration, bbox in enumerate(iterations): + final = iteration == len(iterations) - 1 + + draw_labeled_bbox( + ax, bbox, + "t{box_id}/i{iteration}".format( + box_id=box_id, + iteration=iteration + ), + color="red", + linewidth=5 if final else 2, + fontsize=14 if final else 8, + label_pos="bottom,left" + ) + + ax.add_patch( + patches.Rectangle( + (bbox[0] - max_h_gap, bbox[1] - max_v_gap), + bbox[2] - bbox[0] + 2 * max_h_gap, + bbox[3] - bbox[1] + 2 * max_v_gap, + color="orange", + linestyle="dotted", + fill=False + ) + ) + + for box_id, col_search in enumerate(parse_details["col_searches"]): + draw_labeled_bbox( + ax, col_search["bbox_full"], + "box body + header #{box_id}".format( + box_id=box_id + ), + color="red", + linewidth=4, + label_pos="top,left" + ) + draw_labeled_bbox( + ax, col_search["bbox_body"], + "box body #{box_id}".format( + box_id=box_id + ), + color="cyan", + linewidth=2, + label_pos="bottom,right" + ) + for col_anchor in col_search["cols_anchors"]: + # Display a green line at the col boundary line throughout the + # table bbox. 
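+                # (cols_anchors comes from boundaries_to_split_lines and
+                # includes the outer table edges as its first and last
+                # entries.)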
+ ax.plot( + [col_anchor, col_anchor], + [ + col_search["bbox_body"][1], + col_search["bbox_body"][3], + ], + color="green" + ) + + return ax.get_figure() diff --git a/camelot/utils.py b/camelot/utils.py index fda56f5..d6d6e5c 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1,3 +1,4 @@ +import atexit import os import random import re @@ -15,6 +16,7 @@ from urllib.request import urlopen import numpy as np +import pandas as pd from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LAParams from pdfminer.layout import LTAnno @@ -94,8 +96,22 @@ def download_url(url): return filepath -stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] -lattice_kwargs = [ +common_kwargs = [ + "flag_size", + "margins", + "split_text", + "strip_text", + "table_areas", + "table_regions", + "backend" +] +text_kwargs = common_kwargs + [ + "columns", + "edge_tol", + "row_tol", + "column_tol" +] +lattice_kwargs = common_kwargs + [ "process_background", "line_scale", "copy_text", @@ -107,42 +123,67 @@ def download_url(url): "iterations", "resolution", ] +flavor_to_kwargs = { + "stream": text_kwargs, + "network": text_kwargs, + "lattice": lattice_kwargs, + "hybrid": text_kwargs + lattice_kwargs, +} def validate_input(kwargs, flavor="lattice"): - def check_intersection(parser_kwargs, input_kwargs): - isec = set(parser_kwargs).intersection(set(input_kwargs.keys())) - if isec: - raise ValueError( - f"{','.join(sorted(isec))} cannot be used with flavor='{flavor}'" + parser_kwargs = flavor_to_kwargs[flavor] + # s.difference(t): new set with elements in s but not in t + isec = set(kwargs.keys()).difference(set(parser_kwargs)) + if isec: + raise ValueError( + "{} cannot be used with flavor='{}'".format( + ",".join(sorted(isec)), flavor ) - - if flavor == "lattice": - check_intersection(stream_kwargs, kwargs) - else: - check_intersection(lattice_kwargs, kwargs) + ) def remove_extra(kwargs, flavor="lattice"): - if flavor == "lattice": - for key in kwargs.keys(): - if key in stream_kwargs: - kwargs.pop(key) - else: - for key in kwargs.keys(): - if key in lattice_kwargs: - kwargs.pop(key) + parser_kwargs = flavor_to_kwargs[flavor] + # Avoid "dictionary changed size during iteration" + kwargs_keys = list(kwargs.keys()) + for key in kwargs_keys: + if key not in parser_kwargs: + kwargs.pop(key) return kwargs - # https://stackoverflow.com/a/22726782 +# and https://stackoverflow.com/questions/10965479 class TemporaryDirectory: def __enter__(self): self.name = tempfile.mkdtemp() + # Only delete the temporary directory upon + # program exit. 
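+        # (Deferring cleanup to interpreter exit keeps paths produced by
+        # build_file_path_in_temp_dir below valid after the with-block
+        # has exited.)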
+ atexit.register(shutil.rmtree, self.name) return self.name def __exit__(self, exc_type, exc_value, traceback): - shutil.rmtree(self.name) + pass + + +def build_file_path_in_temp_dir(filename, extension=None): + """Generates a new path within a temporary directory + + Parameters + ---------- + filename : str + extension : str + + Returns + ------- + file_path_in_temporary_dir : str + + """ + with TemporaryDirectory() as temp_dir: + if extension: + filename = filename + extension + path = os.path.join(temp_dir, filename) + return path def translate(x1, x2): @@ -248,8 +289,9 @@ def scale_image(tables, v_segments, h_segments, factors): j_x, j_y = zip(*tables[k]) j_x = [scale(j, scaling_factor_x) for j in j_x] j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y] - joints = zip(j_x, j_y) - tables_new[(x1, y1, x2, y2)] = joints + tables_new[(x1, y1, x2, y2)] = { + "joints": list(zip(j_x, j_y)) + } v_segments_new = [] for v in v_segments: @@ -340,8 +382,82 @@ def segments_in_bbox(bbox, v_segments, h_segments): return v_s, h_s +def get_textline_coords(textline): + """Calculate the coordinates of each alignment for a given textline. + """ + return { + "left": textline.x0, + "right": textline.x1, + "middle": (textline.x0 + textline.x1) / 2.0, + "bottom": textline.y0, + "top": textline.y1, + "center": (textline.y0 + textline.y1) / 2.0, + } + + +def bbox_from_str(bbox_str): + """Deserialize bbox from string ("x1,y1,x2,y2") to tuple (x1, y1, x2, y2). + + Parameters + ---------- + bbox_str : str + Serialized bbox with comma separated coordinates, "x1,y1,x2,y2". + + Returns + ------- + bbox : tuple + Tuple (x1, y1, x2, y2). + + """ + x1, y1, x2, y2 = bbox_str.split(",") + x1 = float(x1) + y1 = float(y1) + x2 = float(x2) + y2 = float(y2) + return ( + min(x1, x2), + min(y1, y2), + max(x1, x2), + max(y1, y2) + ) + + +def bboxes_overlap(bbox1, bbox2): + (left1, bottom1, right1, top1) = bbox1 + (left2, bottom2, right2, top2) = bbox2 + return ((left1 < left2 < right1) or (left1 < right2 < right1)) and ( + (bottom1 < bottom2 < top1) or (bottom1 < top2 < top1) + ) + + +def textlines_overlapping_bbox(bbox, textlines): + """Returns all text objects which overlap or are within a bounding box. + + Parameters + ---------- + bbox : tuple + Tuple (x1, y1, x2, y2) representing a bounding box where + (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate + space. + textlines : List of PDFMiner text objects. + + Returns + ------- + t_bbox : list + List of PDFMiner text objects. + + """ + t_bbox = [ + t + for t in textlines + if bboxes_overlap(bbox, (t.x0, t.y0, t.x1, t.y1)) + ] + return t_bbox + + def text_in_bbox(bbox, text): - """Returns all text objects present inside a bounding box. + """Returns all text objects which lie at least 80% inside a bounding box + across both dimensions. Parameters ---------- @@ -383,6 +499,69 @@ def text_in_bbox(bbox, text): return unique_boxes +def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text): + """Returns all text objects present inside a bounding box, split between + horizontal and vertical text. + Parameters + ---------- + bbox : tuple + Tuple (x1, y1, x2, y2) representing a bounding box where + (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate + space. + horizontal_text : List of PDFMiner text objects. + vertical_text : List of PDFMiner text objects. 
+ Returns + ------- + t_bbox : dict + Dict of lists of PDFMiner text objects that lie inside table, with one + key each for "horizontal" and "vertical" + """ + t_bbox = {} + t_bbox["horizontal"] = text_in_bbox(bbox, horizontal_text) + t_bbox["vertical"] = text_in_bbox(bbox, vertical_text) + t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0)) + t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0)) + return t_bbox + + +def expand_bbox_with_textline(bbox, textline): + """Expand (if needed) a bbox so that it fits the parameter textline. + """ + return ( + min(bbox[0], textline.x0), + min(bbox[1], textline.y0), + max(bbox[2], textline.x1), + max(bbox[3], textline.y1) + ) + + +def bbox_from_textlines(textlines): + """Returns the smallest bbox containing all the text objects passed as + a parameters. + Parameters + ---------- + textlines : List of PDFMiner text objects. + Returns + ------- + bbox : tuple + Tuple (x1, y1, x2, y2) representing a bounding box where + (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate + space. + """ + if len(textlines) == 0: + return None + bbox = ( + textlines[0].x0, + textlines[0].y0, + textlines[0].x1, + textlines[0].y1 + ) + + for tl in textlines[1:]: + bbox = expand_bbox_with_textline(bbox, tl) + return bbox + + def bbox_intersection_area(ba, bb) -> float: """Returns area of the intersection of the bounding boxes of two PDFMiner objects. @@ -442,6 +621,136 @@ def bbox_intersect(ba, bb) -> bool: return ba.x1 >= bb.x0 and bb.x1 >= ba.x0 and ba.y1 >= bb.y0 and bb.y1 >= ba.y0 +def find_columns_boundaries(tls, min_gap=1.0): + """Make a list of disjunct cols boundaries for a list of text objects + + Parameters + ---------- + tls : list of PDFMiner text object. + + min_gap : minimum distance between columns. Any elements closer than + this threshold are merged together. This is to prevent spaces between + words to be misinterpreted as boundaries. + + Returns + ------- + boundaries : list + List x-coordinates for cols. + [(1st col left, 1st col right), (2nd col left, 2nd col right), ...] + + + """ + cols_bounds = [] + tls.sort(key=lambda tl: tl.x0) + for tl in tls: + if (not cols_bounds) or cols_bounds[-1][1] + min_gap < tl.x0: + cols_bounds.append([tl.x0, tl.x1]) + else: + cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1) + return cols_bounds + + +def find_rows_boundaries(tls, min_gap=1.0): + """Make a list of disjunct rows boundaries for a list of text objects + + Parameters + ---------- + tls : list of PDFMiner text object. + + min_gap : minimum distance between rows. Any elements closer than + this threshold are merged together. + + Returns + ------- + boundaries : list + List y-coordinates for rows. + [(1st row bottom, 1st row top), (2nd row bottom, 2nd row top), ...] + + """ + rows_bounds = [] + tls.sort(key=lambda tl: tl.y0) + for tl in tls: + if (not rows_bounds) or rows_bounds[-1][1] + min_gap < tl.y0: + rows_bounds.append([tl.y0, tl.y1]) + else: + rows_bounds[-1][1] = max(rows_bounds[-1][1], tl.y1) + return rows_bounds + + +def boundaries_to_split_lines(boundaries): + """Find split lines given a list of boundaries between rows or cols. + + Boundaries: [ a ] [b] [ c ] [d] + Splits: | | | | | + + Parameters + ---------- + boundaries : list + List of tuples of x- (for columns) or y- (for rows) coord boundaries. + These are the (left, right most) or (bottom, top most) coordinates. 
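+
+    For example (illustrative values)::
+
+        >>> boundaries_to_split_lines([(0, 10), (20, 35), (50, 60)])
+        [0, 15.0, 42.5, 60]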
+ + Returns + ------- + anchors : list + List of coordinates representing the split points, each half way + between boundaries + + """ + # From the row boundaries, identify splits by getting the mid points + # between the boundaries. + anchors = list( + map( + lambda idx: (boundaries[idx - 1][1] + boundaries[idx][0]) / 2.0, + range(1, len(boundaries)), + ) + ) + anchors.insert(0, boundaries[0][0]) + anchors.append(boundaries[-1][1]) + return anchors + + +def get_index_closest_point(point, sorted_list, fn=lambda x: x): + """Return the index of the closest point in the sorted list. + Parameters + ---------- + point : the reference sortable element to search. + sorted_list : list + fn: optional accessor function + Returns + ------- + index : int + """ + + n = len(sorted_list) + if n == 0: + return None + if n == 1: + return 0 + left = 0 + right = n - 1 + mid = 0 + if point >= fn(sorted_list[n - 1]): + return n - 1 + if point <= fn(sorted_list[0]): + return 0 + while left < right: + mid = (left + right) // 2 # find the mid + mid_val = fn(sorted_list[mid]) + if point < mid_val: + right = mid + elif point > mid_val: + left = mid + 1 + else: + return mid + if mid_val > point: + if mid > 0 and (point - fn(sorted_list[mid - 1]) < mid_val - point): + return mid - 1 + elif mid_val < point: + if mid < n - 1 and (fn(sorted_list[mid + 1]) - point < point - mid_val): + return mid + 1 + return mid + + def bbox_longer(ba, bb) -> bool: """Returns True if the bounding box of the first PDFMiner object is longer or equal to the second. diff --git a/tests/data.py b/tests/data.py index 2309ab7..69c468e 100644 --- a/tests/data.py +++ b/tests/data.py @@ -1,6 +1,10 @@ # noqa -data_stream = [ +data_hybrid = [ + [ + "", "Table: 5 Public Health Outlay 2012-13 (Budget" + " Estimates) (Rs. in 000)", "", "", "", "", "", "" + ], ["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"], ["", "", "", "", "", "Revenue &", "", ""], ["", "Medical &", "Family", "Medical &", "Family", "", "", ""], @@ -221,6 +225,10 @@ ], ] +# Hybrid includes the header because the boundaries of the table include it, +# but stream/network don't include it. +data_stream = data_hybrid[1:] + data_stream_table_rotated = [ [ "Table 21 Current use of contraception by background characteristics\u2014Continued", @@ -815,9 +823,18 @@ ], ] -data_stream_two_tables_1 = [ + +# The hybrid parser catches some additional text from the footer +data_hybrid_table_rotated = [ [ - "Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated", + "", + "Table 21 Current use of contraception by background characteristics—Continued", + "", + "", + "", + "", + "", + "", "", "", "", @@ -829,443 +846,1972 @@ "", ], [ - "by the FBI. Some persons may be arrested more than once during a year, therefore, the data in this table, in some cases,", + "", + "", + "", + "", + "", + "Modern method", "", "", "", "", "", "", + "Traditional method", + "", "", "", "", ], [ - "could represent multiple arrests of the same person. 
See text, this section and source]", "", "", + "Any", "", "", "", "", "", "", + "Other", + "Any", "", - ], - ["", "", "Total", "", "", "Male", "", "", "Female", ""], - [ - "Offense charged", "", - "Under 18", - "18 years", "", - "Under 18", - "18 years", + "Not", "", - "Under 18", - "18 years", + "Number", ], [ "", - "Total", - "years", - "and over", - "Total", - "years", - "and over", - "Total", - "years", - "and over", + "Any", + "modern", + "Female", + "Male", + "", + "", + "", + "Condom/", + "modern", + "traditional", + "", + "With-", + "Folk", + "currently", + "", + "of", ], [ - "Total .\n .\n . . . . . .\n . .\n . .\n . .\n . .\n . .\n . .\n . .\n . . .", - "11,062 .6", - "1,540 .0", - "9,522 .6", - "8,263 .3", - "1,071 .6", - "7,191 .7", - "2,799 .2", - "468 .3", - "2,330 .9", + "Background characteristic", + "method", + "method", + "sterilization", + "sterilization", + "Pill", + "IUD", + "Injectables", + "Nirodh", + "method", + "method", + "Rhythm", + "drawal", + "method", + "using", + "Total", + "women", ], + ["Caste/tribe", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], [ - "Violent crime . . . . . . . .\n . .\n . .\n . .\n . .\n . .", - "467 .9", - "69 .1", - "398 .8", - "380 .2", - "56 .5", - "323 .7", - "87 .7", - "12 .6", - "75 .2", + "Scheduled caste", + "74.8", + "55.8", + "42.9", + "0.9", + "9.7", + "0.0", + "0.2", + "2.2", + "0.0", + "19.0", + "11.2", + "7.4", + "0.4", + "25.2", + "100.0", + "1,363", ], - ["Murder and nonnegligent", "", "", "", "", "", "", "", "", ""], [ - "manslaughter . . . . . . . .\n. .\n. .\n. .\n. .\n.", - "10.0", - "0.9", - "9.1", - "9.0", - "0.9", - "8.1", - "1.1", - "–", - "1.0", + "Scheduled tribe", + "59.3", + "39.0", + "26.8", + "0.6", + "6.4", + "0.6", + "1.2", + "3.5", + "0.0", + "20.3", + "10.4", + "5.8", + "4.1", + "40.7", + "100.0", + "256", ], [ - "Forcible rape . . . . . . . .\n. .\n. .\n. .\n. .\n. .", - "17.5", - "2.6", - "14.9", - "17.2", - "2.5", - "14.7", - "–", - "–", - "–", + "Other backward class", + "71.4", + "51.1", + "34.9", + "0.0", + "8.6", + "1.4", + "0.0", + "6.2", + "0.0", + "20.4", + "12.6", + "7.8", + "0.0", + "28.6", + "100.0", + "211", ], [ - "Robbery . . . .\n. .\n. . .\n. . .\n.\n. . .\n.\n. . .\n.\n.", - "102.1", - "25.5", - "76.6", - "90.0", - "22.9", - "67.1", - "12.1", - "2.5", - "9.5", + "Other", + "71.1", + "48.8", + "28.2", + "0.8", + "13.3", + "0.9", + "0.3", + "5.2", + "0.1", + "22.3", + "12.9", + "9.1", + "0.3", + "28.9", + "100.0", + "3,319", ], + ["Wealth index", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], [ - "Aggravated assault . . . . . . . .\n. .\n. .\n.", - "338.4", - "40.1", - "298.3", - "264.0", - "30.2", - "233.8", - "74.4", - "9.9", + "Lowest", "64.5", + "48.6", + "34.3", + "0.5", + "10.5", + "0.6", + "0.7", + "2.0", + "0.0", + "15.9", + "9.9", + "4.6", + "1.4", + "35.5", + "100.0", + "1,258", ], [ - "Property crime . . . .\n . .\n . . .\n . . .\n .\n . . . .", - "1,396 .4", - "338 .7", - "1,057 .7", - "875 .9", - "210 .8", - "665 .1", - "608 .2", - "127 .9", - "392 .6", + "Second", + "68.5", + "50.4", + "36.2", + "1.1", + "11.4", + "0.5", + "0.1", + "1.1", + "0.0", + "18.1", + "11.2", + "6.7", + "0.2", + "31.5", + "100.0", + "1,317", ], [ - "Burglary . .\n. . . . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. 
.\n.", - "240.9", - "60.3", - "180.6", - "205.0", - "53.4", - "151.7", - "35.9", + "Middle", + "75.5", + "52.8", + "33.6", + "0.6", + "14.2", + "0.4", + "0.5", + "3.4", + "0.1", + "22.7", + "13.4", + "8.9", + "0.4", + "24.5", + "100.0", + "1,018", + ], + [ + "Fourth", + "73.9", + "52.3", + "32.0", + "0.5", + "12.5", + "0.6", + "0.2", + "6.3", + "0.2", + "21.6", + "11.5", + "9.9", + "0.2", + "26.1", + "100.0", + "908", + ], + [ + "Highest", + "78.3", + "44.4", + "19.5", + "1.0", + "9.7", + "1.4", + "0.0", + "12.7", + "0.0", + "33.8", + "18.2", + "15.6", + "0.0", + "21.7", + "100.0", + "733", + ], + [ + "Number of living children", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + [ + "No children", + "25.1", + "7.6", + "0.3", + "0.5", + "2.0", + "0.0", + "0.0", + "4.8", + "0.0", + "17.5", + "9.0", + "8.5", + "0.0", + "74.9", + "100.0", + "563", + ], + [ + "1 child", + "66.5", + "32.1", + "3.7", + "0.7", + "20.1", + "0.7", + "0.1", "6.9", - "29.0", + "0.0", + "34.3", + "18.9", + "15.2", + "0.3", + "33.5", + "100.0", + "1,190", ], [ - "Larceny-theft . . . . . . . .\n. .\n. .\n. .\n. .\n. .", - "1,080.1", - "258.1", - "822.0", - "608.8", - "140.5", - "468.3", - "471.3", - "117.6", - "353.6", + "1 son", + "66.8", + "33.2", + "4.1", + "0.7", + "21.1", + "0.5", + "0.3", + "6.6", + "0.0", + "33.5", + "21.2", + "12.3", + "0.0", + "33.2", + "100.0", + "672", + ], + [ + "No sons", + "66.1", + "30.7", + "3.1", + "0.6", + "18.8", + "0.8", + "0.0", + "7.3", + "0.0", + "35.4", + "15.8", + "19.0", + "0.6", + "33.9", + "100.0", + "517", + ], + [ + "2 children", + "81.6", + "60.5", + "41.8", + "0.9", + "11.6", + "0.8", + "0.3", + "4.8", + "0.2", + "21.1", + "12.2", + "8.3", + "0.6", + "18.4", + "100.0", + "1,576", + ], + [ + "1 or more sons", + "83.7", + "64.2", + "46.4", + "0.9", + "10.8", + "0.8", + "0.4", + "4.8", + "0.1", + "19.5", + "11.1", + "7.6", + "0.7", + "16.3", + "100.0", + "1,268", + ], + [ + "No sons", + "73.2", + "45.5", + "23.2", + "1.0", + "15.1", + "0.9", + "0.0", + "4.8", + "0.5", + "27.7", + "16.8", + "11.0", + "0.0", + "26.8", + "100.0", + "308", + ], + [ + "3 children", + "83.9", + "71.2", + "57.7", + "0.8", + "9.8", + "0.6", + "0.5", + "1.8", + "0.0", + "12.7", + "8.7", + "3.3", + "0.8", + "16.1", + "100.0", + "961", + ], + [ + "1 or more sons", + "85.0", + "73.2", + "60.3", + "0.9", + "9.4", + "0.5", + "0.5", + "1.6", + "0.0", + "11.8", + "8.1", + "3.0", + "0.7", + "15.0", + "100.0", + "860", + ], + [ + "No sons", + "74.7", + "53.8", + "35.3", + "0.0", + "13.7", + "1.6", + "0.0", + "3.2", + "0.0", + "20.9", + "13.4", + "6.1", + "1.5", + "25.3", + "100.0", + "101", + ], + [ + "4+ children", + "74.3", + "58.1", + "45.1", + "0.6", + "8.7", + "0.6", + "0.7", + "2.4", + "0.0", + "16.1", + "9.9", + "5.4", + "0.8", + "25.7", + "100.0", + "944", + ], + [ + "1 or more sons", + "73.9", + "58.2", + "46.0", + "0.7", + "8.3", + "0.7", + "0.7", + "1.9", + "0.0", + "15.7", + "9.4", + "5.5", + "0.8", + "26.1", + "100.0", + "901", + ], + [ + "No sons", + "(82.1)", + "(57.3)", + "(25.6)", + "(0.0)", + "(17.8)", + "(0.0)", + "(0.0)", + "(13.9)", + "(0.0)", + "(24.8)", + "(21.3)", + "(3.5)", + "(0.0)", + "(17.9)", + "100.0", + "43", + ], + [ + "Total", + "71.2", + "49.9", + "32.2", + "0.7", + "11.7", + "0.6", + "0.3", + "4.3", + "0.1", + "21.3", + "12.3", + "8.4", + "0.5", + "28.8", + "100.0", + "5,234", + ], + [ + "NFHS-2 (1998-99)", + "66.6", + "47.3", + "32.0", + "1.8", + "9.2", + "1.4", + "na", + "2.9", + "na", + "na", + "8.7", + "9.8", + "na", + 
"33.4", + "100.0", + "4,116", + ], + [ + "NFHS-1 (1992-93)", + "57.7", + "37.6", + "26.5", + "4.3", + "3.6", + "1.3", + "0.1", + "1.9", + "na", + "na", + "11.3", + "8.3", + "na", + "42.3", + "100.0", + "3,970", + ], + [ + "", + "Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + [ + "not shown separately.", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + [ + "na = Not available", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + [ + "", + "ns = Not shown; see table 2b, footnote 1", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + [ + "( ) Based on 25-49 unweighted cases.", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], +] + + +# The streaming algorithm incorrectly includes a header in the result. +# Trimming the table for the test of network, which doesn't include it. +data_network_table_rotated = data_stream_table_rotated[1:] + + +data_stream_two_tables_1 = [ + [ + "Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + [ + "by the FBI. Some persons may be arrested more than once during a year, therefore, the data in this table, in some cases,", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + [ + "could represent multiple arrests of the same person. See text, this section and source]", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + ["", "", "Total", "", "", "Male", "", "", "Female", ""], + [ + "Offense charged", + "", + "Under 18", + "18 years", + "", + "Under 18", + "18 years", + "", + "Under 18", + "18 years", + ], + [ + "", + "Total", + "years", + "and over", + "Total", + "years", + "and over", + "Total", + "years", + "and over", + ], + [ + "Total .\n .\n . . . . . .\n . .\n . .\n . .\n . .\n . .\n . .\n . .\n . . .", + "11,062 .6", + "1,540 .0", + "9,522 .6", + "8,263 .3", + "1,071 .6", + "7,191 .7", + "2,799 .2", + "468 .3", + "2,330 .9", + ], + [ + "Violent crime . . . . . . . .\n . .\n . .\n . .\n . .\n . .", + "467 .9", + "69 .1", + "398 .8", + "380 .2", + "56 .5", + "323 .7", + "87 .7", + "12 .6", + "75 .2", + ], + ["Murder and nonnegligent", "", "", "", "", "", "", "", "", ""], + [ + "manslaughter . . . . . . . .\n. .\n. .\n. .\n. .\n.", + "10.0", + "0.9", + "9.1", + "9.0", + "0.9", + "8.1", + "1.1", + "–", + "1.0", + ], + [ + "Forcible rape . . . . . . . .\n. .\n. .\n. .\n. .\n. .", + "17.5", + "2.6", + "14.9", + "17.2", + "2.5", + "14.7", + "–", + "–", + "–", + ], + [ + "Robbery . . . .\n. .\n. . .\n. . .\n.\n. . .\n.\n. . .\n.\n.", + "102.1", + "25.5", + "76.6", + "90.0", + "22.9", + "67.1", + "12.1", + "2.5", + "9.5", + ], + [ + "Aggravated assault . . . . . . . .\n. .\n. .\n.", + "338.4", + "40.1", + "298.3", + "264.0", + "30.2", + "233.8", + "74.4", + "9.9", + "64.5", + ], + [ + "Property crime . . . .\n . .\n . . .\n . . .\n .\n . . . .", + "1,396 .4", + "338 .7", + "1,057 .7", + "875 .9", + "210 .8", + "665 .1", + "608 .2", + "127 .9", + "392 .6", + ], + [ + "Burglary . .\n. . . . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. 
.\n.", + "240.9", + "60.3", + "180.6", + "205.0", + "53.4", + "151.7", + "35.9", + "6.9", + "29.0", + ], + [ + "Larceny-theft . . . . . . . .\n. .\n. .\n. .\n. .\n. .", + "1,080.1", + "258.1", + "822.0", + "608.8", + "140.5", + "468.3", + "471.3", + "117.6", + "353.6", + ], + [ + "Motor vehicle theft . . . . .\n. .\n. . .\n.\n.\n. .", + "65.6", + "16.0", + "49.6", + "53.9", + "13.3", + "40.7", + "11.7", + "2.7", + "8.9", + ], + [ + "Arson .\n. . . . .\n. . .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .", + "9.8", + "4.3", + "5.5", + "8.1", + "3.7", + "4.4", + "1.7", + "0.6", + "1.1", + ], + [ + "Other assaults .\n. . . . . .\n. . .\n.\n. . .\n.\n. .\n.", + "1,061.3", + "175.3", + "886.1", + "785.4", + "115.4", + "670.0", + "276.0", + "59.9", + "216.1", + ], + [ + "Forgery and counterfeiting .\n. . . . . . .\n.", + "68.9", + "1.7", + "67.2", + "42.9", + "1.2", + "41.7", + "26.0", + "0.5", + "25.5", + ], + [ + "Fraud .\n.\n.\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n.", + "173.7", + "5.1", + "168.5", + "98.4", + "3.3", + "95.0", + "75.3", + "1.8", + "73.5", + ], + [ + "Embezzlement . . .\n. . . . .\n. . .\n.\n. . .\n.\n.\n.", + "14.6", + "–", + "14.1", + "7.2", + "–", + "6.9", + "7.4", + "–", + "7.2", + ], + [ + "Stolen property 1 . . . . . . .\n. . .\n. .\n. .\n.\n.", + "84.3", + "15.1", + "69.2", + "66.7", + "12.2", + "54.5", + "17.6", + "2.8", + "14.7", + ], + [ + "Vandalism . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n.\n.\n.", + "217.4", + "72.7", + "144.7", + "178.1", + "62.8", + "115.3", + "39.3", + "9.9", + "29.4", + ], + [ + "Weapons; carrying, possessing, etc. .", + "132.9", + "27.1", + "105.8", + "122.1", + "24.3", + "97.8", + "10.8", + "2.8", + "8.0", + ], + [ + "Prostitution and commercialized vice", + "56.9", + "1.1", + "55.8", + "17.3", + "–", + "17.1", + "39.6", + "0.8", + "38.7", + ], + [ + "Sex offenses 2 . . . . .\n. . . . .\n. .\n. .\n. . .\n.", + "61.5", + "10.7", + "50.7", + "56.1", + "9.6", + "46.5", + "5.4", + "1.1", + "4.3", + ], + [ + "Drug abuse violations . . . . . . . .\n. .\n.\n.", + "1,333.0", + "136.6", + "1,196.4", + "1,084.3", + "115.2", + "969.1", + "248.7", + "21.4", + "227.3", + ], + [ + "Gambling .\n. . . . . .\n. .\n.\n. . .\n.\n. . .\n.\n. .\n.\n.", + "8.2", + "1.4", + "6.8", + "7.2", + "1.4", + "5.9", + "0.9", + "–", + "0.9", + ], + ["Offenses against the family and", "", "", "", "", "", "", "", "", ""], + [ + "children . . . .\n. . . .\n. .\n. .\n. .\n. .\n. .\n. . .\n.", + "92.4", + "3.7", + "88.7", + "68.9", + "2.4", + "66.6", + "23.4", + "1.3", + "22.1", + ], + [ + "Driving under the influence . . . . . .\n. .", + "1,158.5", + "109.2", + "1,147.5", + "895.8", + "8.2", + "887.6", + "262.7", + "2.7", + "260.0", + ], + [ + "Liquor laws . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .", + "48.2", + "90.2", + "368.0", + "326.8", + "55.4", + "271.4", + "131.4", + "34.7", + "96.6", + ], + [ + "Drunkenness . . .\n. . . . .\n. . .\n.\n. . .\n.\n. .\n.", + "488.1", + "11.4", + "476.8", + "406.8", + "8.5", + "398.3", + "81.3", + "2.9", + "78.4", + ], + [ + "Disorderly conduct . .\n. . . . . . .\n. .\n. .\n. .", + "529.5", + "136.1", + "393.3", + "387.1", + "90.8", + "296.2", + "142.4", + "45.3", + "97.1", + ], + [ + "Vagrancy . . . .\n. . . . .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.", + "26.6", + "2.2", + "24.4", + "20.9", + "1.6", + "19.3", + "5.7", + "0.6", + "5.1", + ], + [ + "All other offenses (except traffic) . . .\n.", + "306.1", + "263.4", + "2,800.8", + "2,337.1", + "194.2", + "2,142.9", + "727.0", + "69.2", + "657.9", + ], + [ + "Suspicion . . . 
.\n. . . .\n. .\n. .\n. .\n. .\n. .\n. . .\n.", + "1.6", + "–", + "1.4", + "1.2", + "–", + "1.0", + "–", + "–", + "–", + ], + [ + "Curfew and loitering law violations .\n.", + "91.0", + "91.0", + "(X)", + "63.1", + "63.1", + "(X)", + "28.0", + "28.0", + "(X)", + ], + [ + "Runaways . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n.\n.\n.", + "75.8", + "75.8", + "(X)", + "34.0", + "34.0", + "(X)", + "41.8", + "41.8", + "(X)", + ], + [ + "", + "– Represents zero. X Not applicable. 1 Buying, receiving, possessing stolen property. 2 Except forcible rape and prostitution.", + "", + "", + "", + "", + "", + "", + "", + "", + ], +] + +# The streaming algorithm incorrectly includes a header and a footer. +# Trimming the table for the test of network, which doesn't include it. +data_network_two_tables_1 = data_stream_two_tables_1[3:-1] + + +data_stream_two_tables_2 = [ + ["Table 325. Arrests by Race: 2009", "", "", "", "", ""], + [ + "[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies", + "", + "", + "", + "", + "", + ], + [ + "with a total population of 239,839,971 as estimated by the FBI. See headnote, Table 324]", + "", + "", + "", + "", + "", + ], + ["", "", "", "", "American", ""], + ["Offense charged", "", "", "", "Indian/Alaskan", "Asian Pacific"], + ["", "Total", "White", "Black", "Native", "Islander"], + [ + "Total .\n .\n .\n .\n . .\n . . .\n . . .\n .\n . . .\n .\n . . .\n . .\n .\n . . .\n .\n .\n .\n . .\n . .\n . .", + "10,690,561", + "7,389,208", + "3,027,153", + "150,544", + "123,656", + ], + [ + "Violent crime . . . . . . . .\n . .\n . .\n . .\n . .\n .\n .\n . .\n . .\n .\n .\n .\n .\n . .", + "456,965", + "268,346", + "177,766", + "5,608", + "5,245", + ], + [ + "Murder and nonnegligent manslaughter . .\n. .\n.\n. .", + "9,739", + "4,741", + "4,801", + "100", + "97", + ], + [ + "Forcible rape . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .", + "16,362", + "10,644", + "5,319", + "169", + "230", + ], + [ + "Robbery . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . .", + "100,496", + "43,039", + "55,742", + "726", + "989", + ], + [ + "Aggravated assault . . . . . . . .\n. .\n. .\n.\n.\n.\n.\n. .\n. .\n.\n.\n.", + "330,368", + "209,922", + "111,904", + "4,613", + "3,929", + ], + [ + "Property crime . . . . .\n . . . . .\n .\n . . .\n .\n . .\n .\n .\n .\n . .\n .\n . .\n .\n .", + "1,364,409", + "922,139", + "406,382", + "17,599", + "18,289", + ], + [ + "Burglary . . .\n. . . . .\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n. . . .", + "234,551", + "155,994", + "74,419", + "2,021", + "2,117", + ], + [ + "Larceny-theft . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .", + "1,056,473", + "719,983", + "306,625", + "14,646", + "15,219", + ], + [ + "Motor vehicle theft . . . . . .\n. .\n.\n. . .\n.\n. .\n.\n.\n.\n. .\n.\n. .\n.", + "63,919", + "39,077", + "23,184", + "817", + "841", + ], + [ + "Arson .\n. . . .\n. .\n. .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . . . .", + "9,466", + "7,085", + "2,154", + "115", + "112", + ], + [ + "Other assaults .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n. .\n.\n.\n.\n. .\n.\n. .\n.", + "1,032,502", + "672,865", + "332,435", + "15,127", + "12,075", + ], + [ + "Forgery and counterfeiting .\n. . . . . . .\n.\n. .\n.\n.\n.\n. .\n. .\n.", + "67,054", + "44,730", + "21,251", + "345", + "728", + ], + [ + "Fraud .\n.\n. . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. . . . . . 
.", + "161,233", + "108,032", + "50,367", + "1,315", + "1,519", + ], + [ + "Embezzlement . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. .\n.\n. .\n.\n.\n.\n.", + "13,960", + "9,208", + "4,429", + "75", + "248", + ], + [ + "Stolen property; buying, receiving, possessing .\n. .", + "82,714", + "51,953", + "29,357", + "662", + "742", + ], + [ + "Vandalism . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n. .", + "212,173", + "157,723", + "48,746", + "3,352", + "2,352", + ], + [ + "Weapons—carrying, possessing, etc. .\n. .\n. .\n.\n. .\n. .", + "130,503", + "74,942", + "53,441", + "951", + "1,169", + ], + [ + "Prostitution and commercialized vice . .\n.\n. .\n. .\n. .\n.", + "56,560", + "31,699", + "23,021", + "427", + "1,413", + ], + [ + "Sex offenses 1 . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .", + "60,175", + "44,240", + "14,347", + "715", + "873", + ], + [ + "Drug abuse violations . . . . . . . .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", + "1,301,629", + "845,974", + "437,623", + "8,588", + "9,444", + ], + [ + "Gambling . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n. .\n.\n. . .\n.\n.\n.\n.\n. .\n. .", + "8,046", + "2,290", + "5,518", + "27", + "211", + ], + [ + "Offenses against the family and children .\n.\n. .\n. .\n. .", + "87,232", + "58,068", + "26,850", + "1,690", + "624", + ], + [ + "Driving under the influence . . . . . . .\n. .\n.\n. .\n.\n.\n.\n.\n. .", + "1,105,401", + "954,444", + "121,594", + "14,903", + "14,460", + ], + [ + "Liquor laws . . . . . . . .\n. .\n. .\n. .\n. .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", + "444,087", + "373,189", + "50,431", + "14,876", + "5,591", + ], + [ + "Drunkenness . .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n.\n.\n. . .\n.\n.\n.\n.\n.\n.", + "469,958", + "387,542", + "71,020", + "8,552", + "2,844", + ], + [ + "Disorderly conduct . . .\n. . . . . .\n. .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", + "515,689", + "326,563", + "176,169", + "8,783", + "4,174", + ], + [ + "Vagrancy . . .\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . .", + "26,347", + "14,581", + "11,031", + "543", + "192", ], [ - "Motor vehicle theft . . . . .\n. .\n. . .\n.\n.\n. .", - "65.6", - "16.0", - "49.6", - "53.9", - "13.3", - "40.7", - "11.7", - "2.7", - "8.9", + "All other offenses (except traffic) . .\n. .\n. .\n. .\n.\n.\n.\n. .\n.", + "2,929,217", + "1,937,221", + "911,670", + "43,880", + "36,446", ], [ - "Arson .\n. . . . .\n. . .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .", - "9.8", - "4.3", - "5.5", - "8.1", - "3.7", - "4.4", - "1.7", - "0.6", - "1.1", + "Suspicion . . .\n. . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n.\n.\n. .\n. . . .", + "1,513", + "677", + "828", + "1", + "7", ], [ - "Other assaults .\n. . . . . .\n. . .\n.\n. . .\n.\n. .\n.", - "1,061.3", - "175.3", - "886.1", - "785.4", - "115.4", - "670.0", - "276.0", - "59.9", - "216.1", + "Curfew and loitering law violations . .\n. .\n.\n. .\n. .\n.\n.\n.", + "89,578", + "54,439", + "33,207", + "872", + "1,060", ], [ - "Forgery and counterfeiting .\n. . . . . . .\n.", - "68.9", - "1.7", - "67.2", - "42.9", - "1.2", - "41.7", - "26.0", - "0.5", - "25.5", + "Runaways . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n. .", + "73,616", + "48,343", + "19,670", + "1,653", + "3,950", ], + ["1 Except forcible rape and prostitution.", "", "", "", "", ""], +] + + +# The streaming algorithm incorrectly includes a header and a footer. +# Trimming the table for the test of network, which doesn't include it. 
+data_network_two_tables_2 = data_stream_two_tables_2[3:-1] + +data_network_two_tables_b_1 = [ + ["1", "Ghfhbdhj", "1", "Hgfdhgjsdhjdsf"], + ["Vgvhgh", "Hj", "Hj", "Hj"], + ["Hj", "Hj", "Hj", "Hj"], + ["Hj", "Hj", "J", "Hj"], + ["V", "C", "D", "Gfhj"], + ["Hjb", "B", "Jhbh", "Hj"], + ["Hjdhshj", "Hjhjhh", "Ddnj", "dsxv"], +] + +data_network_two_tables_b_2 = [ + ["Trtrt", "H", "Gh"], + ["Gh", "V", "Hv"], + ["Hv", "Bhjb", "hg"], +] + +# The streaming algorithm incorrectly includes a header and a footer. +# Trimming the table for the test of network, which doesn't include it. +data_network_two_tables_2 = data_stream_two_tables_2[3:-1] + +data_network_vertical_headers = [ [ - "Fraud .\n.\n.\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n.", - "173.7", - "5.1", - "168.5", - "98.4", - "3.3", - "95.0", - "75.3", - "1.8", - "73.5", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "Congress-", + "Senator 36th", + "Rep106th", + "", + "Reg. of", + "Road", + "", + "", + "", + "Distri", + "Dist", + "", + "", ], [ - "Embezzlement . . .\n. . . . .\n. . .\n.\n. . .\n.\n.\n.", - "14.6", - "–", - "14.1", - "7.2", - "–", - "6.9", - "7.4", - "–", - "7.2", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "1st Dist", + "Dist.", + "Dist.", + "", + "Deeds", + "", + "Commission", + "", + "District #1", + "ct #2", + "#3", + "", + "Dist #4", ], [ - "Stolen property 1 . . . . . . .\n. . .\n. .\n. .\n.\n.", - "84.3", - "15.1", - "69.2", - "66.7", - "12.2", - "54.5", - "17.6", - "2.8", - "14.7", + "", + "", + "", + "", + "", + "Governor", + "", + "", + "U.S. Senator", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", ], [ - "Vandalism . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n.\n.\n.", - "217.4", - "72.7", - "144.7", - "178.1", - "62.8", - "115.3", - "39.3", - "9.9", - "29.4", + "", + "Number of Registered voters", + "Poll Book Totals", + "Brian Calley", + "Patrick Colbeck", + "Jim Hines", + "Bill Schuette", + "John James", + "Sandy Pensler", + "", + "Jack Bergman", + "", + "Jim Stamas", + "Sue Allor", + "Melissa A. Cordes", + "", + "Al Scully", + "", + "Daniel G. Gauthier", + "Craig M. Clemens", + "Craig Johnston", + "Carolyn Brummund", + "Adam Brege", + "David Bielusiak", ], [ - "Weapons; carrying, possessing, etc. .", - "132.9", - "27.1", - "105.8", - "122.1", - "24.3", - "97.8", - "10.8", - "2.8", - "8.0", + "Alcona", + "963", + "439", + "55", + "26", + "47", + "164", + "173", + "111", + "", + "268", + "", + "272", + "275", + "269", + "", + "271", + "", + "224", + "76", + "", + "", + "", + "", ], [ - "Prostitution and commercialized vice", - "56.9", - "1.1", - "55.8", - "17.3", - "–", - "17.1", - "39.6", - "0.8", - "38.7", + "Caledonia", + "923", + "393", + "40", + "23", + "45", + "158", + "150", + "103", + "", + "244", + "", + "247", + "254", + "255", + "", + "244", + "", + "139", + "143", + "", + "", + "", + "", ], [ - "Sex offenses 2 . . . . .\n. . . . .\n. .\n. .\n. . .\n.", - "61.5", - "10.7", - "50.7", - "56.1", - "9.6", - "46.5", - "5.4", - "1.1", - "4.3", + "Curtis", + "1026", + "349", + "30", + "30", + "25", + "102", + "95", + "84", + "", + "159", + "", + "164", + "162", + "161", + "", + "157", + "", + "", + "", + "", + "", + "", + "", ], [ - "Drug abuse violations . . . . . . . .\n. 
.\n.\n.", - "1,333.0", - "136.6", - "1,196.4", - "1,084.3", - "115.2", - "969.1", - "248.7", - "21.4", - "227.3", + "Greenbush", + "1212", + "423", + "56", + "26", + "40", + "126", + "104", + "131", + "", + "208", + "", + "213", + "214", + "215", + "", + "208", + "", + "", + "", + "", + "208", + "", + "", ], [ - "Gambling .\n. . . . . .\n. .\n.\n. . .\n.\n. . .\n.\n. .\n.\n.", - "8.2", - "1.4", - "6.8", - "7.2", - "1.4", - "5.9", - "0.9", - "–", - "0.9", + "Gustin", + "611", + "180", + "22", + "35", + "17", + "55", + "73", + "45", + "", + "108", + "", + "104", + "111", + "111", + "", + "109", + "", + "", + "", + "", + "", + "81", + "42", ], - ["Offenses against the family and", "", "", "", "", "", "", "", "", ""], [ - "children . . . .\n. . . .\n. .\n. .\n. .\n. .\n. .\n. . .\n.", - "92.4", - "3.7", - "88.7", - "68.9", - "2.4", - "66.6", - "23.4", - "1.3", - "22.1", + "Harrisville", + "1142", + "430", + "45", + "90", + "29", + "101", + "155", + "94", + "", + "226", + "", + "226", + "232", + "244", + "", + "226", + "", + "", + "", + "232", + "", + "", + "", ], [ - "Driving under the influence . . . . . .\n. .", - "1,158.5", - "109.2", - "1,147.5", - "895.8", - "8.2", - "887.6", - "262.7", - "2.7", - "260.0", + "Hawes", + "884", + "293", + "38", + "36", + "27", + "109", + "121", + "84", + "", + "192", + "", + "195", + "195", + "193", + "", + "184", + "", + "", + "", + "", + "", + "118", + "87", ], [ - "Liquor laws . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .", - "48.2", - "90.2", - "368.0", - "326.8", - "55.4", - "271.4", - "131.4", - "34.7", - "96.6", + "Haynes", + "626", + "275", + "31", + "20", + "32", + "104", + "121", + "53", + "", + "163", + "", + "163", + "173", + "161", + "", + "152", + "", + "", + "", + "76", + "", + "69", + "31", ], [ - "Drunkenness . . .\n. . . . .\n. . .\n.\n. . .\n.\n. .\n.", - "488.1", - "11.4", - "476.8", - "406.8", - "8.5", - "398.3", - "81.3", - "2.9", - "78.4", + "Mikado", + "781", + "208", + "19", + "39", + "17", + "81", + "90", + "63", + "", + "149", + "", + "149", + "145", + "147", + "", + "143", + "", + "", + "", + "", + "113", + "", + "", ], [ - "Disorderly conduct . .\n. . . . . . .\n. .\n. .\n. .", - "529.5", - "136.1", - "393.3", - "387.1", - "90.8", - "296.2", - "142.4", - "45.3", - "97.1", + "Millen", + "353", + "139", + "7", + "16", + "13", + "38", + "49", + "19", + "", + "62", + "", + "66", + "67", + "66", + "", + "62", + "", + "", + "", + "", + "", + "", + "", ], [ - "Vagrancy . . . .\n. . . . .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.", - "26.6", - "2.2", - "24.4", - "20.9", - "1.6", - "19.3", - "5.7", - "0.6", - "5.1", + "Mitchell", + "327", + "96", + "12", + "17", + "7", + "29", + "41", + "17", + "", + "57", + "", + "55", + "57", + "60", + "", + "56", + "", + "", + "", + "", + "", + "", + "", ], [ - "All other offenses (except traffic) . . .\n.", - "306.1", - "263.4", - "2,800.8", - "2,337.1", - "194.2", - "2,142.9", - "727.0", - "69.2", - "657.9", + "City Harrisville", + "389", + "171", + "16", + "15", + "18", + "35", + "49", + "31", + "", + "78", + "", + "80", + "82", + "81", + "", + "77", + "", + "", + "", + "73", + "", + "", + "", ], [ - "Suspicion . . . .\n. . . .\n. .\n. .\n. .\n. .\n. .\n. . 
.\n.", - "1.6", - "–", - "1.4", - "1.2", - "–", - "1.0", - "–", - "–", - "–", + "Totals", + "9237", + "3396", + "371", + "373", + "317", + "1102", + "1221", + "835", + "0", + "1914", + "0", + "1934", + "1967", + "1963", + "0", + "1889", + "0", + "363", + "219", + "381", + "321", + "268", + "160", ], +] + +# Compared to network, hybrid detects additional sparse columns +data_hybrid_vertical_headers = [ [ - "Curfew and loitering law violations .\n.", - "91.0", - "91.0", - "(X)", - "63.1", - "63.1", - "(X)", - "28.0", - "28.0", - "(X)", + "", "", "", "", "", "STATE", "", "", "", "CONGRESSIONAL", "", "", + "", "", "LEGISLATIVE", "", "", "COUNTY", "", "COUNTY", "", "", + "County Commissioner", "", "", "", "" ], [ - "Runaways . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n.\n.\n.", - "75.8", - "75.8", - "(X)", - "34.0", - "34.0", - "(X)", - "41.8", - "41.8", - "(X)", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "Congress-", + "", + "", + "Senator 36th", + "", + "Rep106th", + "", + "Reg. of", + "", + "Road", + "", + "", + "Distri", + "Dist", + "", + "", + "Dist", + ], + [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "1st Dist", + "Dist.", + "", + "", + "Dist.", + "Deeds", + "", + "Commission", + "", + "District #1", + "", + "ct #2", + "#3", + "Dist #4", + "", + "#5", ], [ "", - "– Represents zero. X Not applicable. 1 Buying, receiving, possessing stolen property. 2 Except forcible rape and prostitution.", + "", + "", + "", + "", + "Governor", + "", + "", + "U.S. Senator", "", "", "", @@ -1274,288 +2820,423 @@ "", "", "", - ], -] - - -data_stream_two_tables_2 = [ - ["Table 325. Arrests by Race: 2009", "", "", "", "", ""], - [ - "[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies", "", "", "", "", "", - ], - [ - "with a total population of 239,839,971 as estimated by the FBI. See headnote, Table 324]", "", "", "", "", "", - ], - ["", "", "", "", "American", ""], - ["Offense charged", "", "", "", "Indian/Alaskan", "Asian Pacific"], - ["", "Total", "White", "Black", "Native", "Islander"], - [ - "Total .\n .\n .\n .\n . .\n . . .\n . . .\n .\n . . .\n .\n . . .\n . .\n .\n . . .\n .\n .\n .\n . .\n . .\n . .", - "10,690,561", - "7,389,208", - "3,027,153", - "150,544", - "123,656", - ], - [ - "Violent crime . . . . . . . .\n . .\n . .\n . .\n . .\n .\n .\n . .\n . .\n .\n .\n .\n .\n . .", - "456,965", - "268,346", - "177,766", - "5,608", - "5,245", - ], - [ - "Murder and nonnegligent manslaughter . .\n. .\n.\n. .", - "9,739", - "4,741", - "4,801", - "100", - "97", - ], - [ - "Forcible rape . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .", - "16,362", - "10,644", - "5,319", - "169", - "230", - ], - [ - "Robbery . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . .", - "100,496", - "43,039", - "55,742", - "726", - "989", - ], - [ - "Aggravated assault . . . . . . . .\n. .\n. .\n.\n.\n.\n.\n. .\n. .\n.\n.\n.", - "330,368", - "209,922", - "111,904", - "4,613", - "3,929", - ], - [ - "Property crime . . . . .\n . . . . .\n .\n . . .\n .\n . .\n .\n .\n .\n . .\n .\n . .\n .\n .", - "1,364,409", - "922,139", - "406,382", - "17,599", - "18,289", - ], - [ - "Burglary . . .\n. . . . .\n. . .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n. . . .", - "234,551", - "155,994", - "74,419", - "2,021", - "2,117", - ], - [ - "Larceny-theft . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. 
.", - "1,056,473", - "719,983", - "306,625", - "14,646", - "15,219", - ], - [ - "Motor vehicle theft . . . . . .\n. .\n.\n. . .\n.\n. .\n.\n.\n.\n. .\n.\n. .\n.", - "63,919", - "39,077", - "23,184", - "817", - "841", - ], - [ - "Arson .\n. . . .\n. .\n. .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . . . .", - "9,466", - "7,085", - "2,154", - "115", - "112", - ], - [ - "Other assaults .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n. .\n.\n.\n.\n. .\n.\n. .\n.", - "1,032,502", - "672,865", - "332,435", - "15,127", - "12,075", - ], - [ - "Forgery and counterfeiting .\n. . . . . . .\n.\n. .\n.\n.\n.\n. .\n. .\n.", - "67,054", - "44,730", - "21,251", - "345", - "728", - ], - [ - "Fraud .\n.\n. . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. . . . . . .", - "161,233", - "108,032", - "50,367", - "1,315", - "1,519", - ], - [ - "Embezzlement . . . .\n. . . . .\n.\n. . .\n.\n. . .\n.\n.\n. .\n.\n. .\n.\n.\n.\n.", - "13,960", - "9,208", - "4,429", - "75", - "248", - ], - [ - "Stolen property; buying, receiving, possessing .\n. .", - "82,714", - "51,953", - "29,357", - "662", - "742", - ], - [ - "Vandalism . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n. .", - "212,173", - "157,723", - "48,746", - "3,352", - "2,352", - ], - [ - "Weapons—carrying, possessing, etc. .\n. .\n. .\n.\n. .\n. .", - "130,503", - "74,942", - "53,441", - "951", - "1,169", ], [ - "Prostitution and commercialized vice . .\n.\n. .\n. .\n. .\n.", - "56,560", - "31,699", - "23,021", - "427", - "1,413", + "", + "Number of Registered voters", + "Poll Book Totals", + "Brian Calley", + "Patrick Colbeck", + "Jim Hines", + "Bill Schuette", + "John James", + "Sandy Pensler", + "", + "Jack Bergman", + "", + "Jim Stamas", + "", + "Sue Allor", + "", + "Melissa A. Cordes", + "", + "Al Scully", + "", + "Daniel G. Gauthier", + "Craig M. Clemens", + "Craig Johnston", + "Carolyn Brummund", + "Adam Brege", + "David Bielusiak", + "", ], [ - "Sex offenses 1 . . . . . . . .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n.\n. .", - "60,175", - "44,240", - "14,347", - "715", - "873", + "Alcona", + "963", + "439", + "55", + "26", + "47", + "164", + "173", + "111", + "", + "268", + "", + "272", + "", + "275", + "", + "269", + "", + "271", + "", + "224", + "76", + "", + "", + "", + "", + "", ], [ - "Drug abuse violations . . . . . . . .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", - "1,301,629", - "845,974", - "437,623", - "8,588", - "9,444", + "Caledonia", + "923", + "393", + "40", + "23", + "45", + "158", + "150", + "103", + "", + "244", + "", + "247", + "", + "254", + "", + "255", + "", + "244", + "", + "139", + "143", + "", + "", + "", + "", + "", ], [ - "Gambling . . . . .\n. . . . .\n.\n. . .\n.\n. . .\n. .\n.\n. . .\n.\n.\n.\n.\n. .\n. .", - "8,046", - "2,290", - "5,518", - "27", - "211", + "Curtis", + "1026", + "349", + "30", + "30", + "25", + "102", + "95", + "84", + "", + "159", + "", + "164", + "", + "162", + "", + "161", + "", + "157", + "", + "", + "", + "", + "", + "", + "", + "", ], [ - "Offenses against the family and children .\n.\n. .\n. .\n. .", - "87,232", - "58,068", - "26,850", - "1,690", - "624", + "Greenbush", + "1212", + "423", + "56", + "26", + "40", + "126", + "104", + "131", + "", + "208", + "", + "213", + "", + "214", + "", + "215", + "", + "208", + "", + "", + "", + "", + "208", + "", + "", + "", ], [ - "Driving under the influence . . . . . . .\n. .\n.\n. .\n.\n.\n.\n.\n. 
.", - "1,105,401", - "954,444", - "121,594", - "14,903", - "14,460", + "Gustin", + "611", + "180", + "22", + "35", + "17", + "55", + "73", + "45", + "", + "108", + "", + "104", + "", + "111", + "", + "111", + "", + "109", + "", + "", + "", + "", + "", + "81", + "42", + "", ], [ - "Liquor laws . . . . . . . .\n. .\n. .\n. .\n. .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", - "444,087", - "373,189", - "50,431", - "14,876", - "5,591", + "Harrisville", + "1142", + "430", + "45", + "90", + "29", + "101", + "155", + "94", + "", + "226", + "", + "226", + "", + "232", + "", + "244", + "", + "226", + "", + "", + "", + "232", + "", + "", + "", + "", ], [ - "Drunkenness . .\n. . . . . . .\n.\n. . .\n.\n. . .\n.\n.\n.\n. . .\n.\n.\n.\n.\n.\n.", - "469,958", - "387,542", - "71,020", - "8,552", - "2,844", + "Hawes", + "884", + "293", + "38", + "36", + "27", + "109", + "121", + "84", + "", + "192", + "", + "195", + "", + "195", + "", + "193", + "", + "184", + "", + "", + "", + "", + "", + "118", + "87", + "", ], [ - "Disorderly conduct . . .\n. . . . . .\n. .\n. . .\n.\n.\n.\n. .\n. .\n.\n.\n.\n.", - "515,689", - "326,563", - "176,169", - "8,783", - "4,174", + "Haynes", + "626", + "275", + "31", + "20", + "32", + "104", + "121", + "53", + "", + "163", + "", + "163", + "", + "173", + "", + "161", + "", + "152", + "", + "", + "", + "76", + "", + "69", + "31", + "", ], [ - "Vagrancy . . .\n. .\n. . . .\n. .\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. .\n.\n.\n. . . .", - "26,347", - "14,581", - "11,031", - "543", - "192", + "Mikado", + "781", + "208", + "19", + "39", + "17", + "81", + "90", + "63", + "", + "149", + "", + "149", + "", + "145", + "", + "147", + "", + "143", + "", + "", + "", + "", + "113", + "", + "", + "", ], [ - "All other offenses (except traffic) . .\n. .\n. .\n. .\n.\n.\n.\n. .\n.", - "2,929,217", - "1,937,221", - "911,670", - "43,880", - "36,446", + "Millen", + "353", + "139", + "7", + "16", + "13", + "38", + "49", + "19", + "", + "62", + "", + "66", + "", + "67", + "", + "66", + "", + "62", + "", + "", + "", + "", + "", + "", + "", + "", ], [ - "Suspicion . . .\n. . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n.\n.\n. .\n. . . .", - "1,513", - "677", - "828", - "1", + "Mitchell", + "327", + "96", + "12", + "17", "7", + "29", + "41", + "17", + "", + "57", + "", + "55", + "", + "57", + "", + "60", + "", + "56", + "", + "", + "", + "", + "", + "", + "", + "", ], [ - "Curfew and loitering law violations . .\n. .\n.\n. .\n. .\n.\n.\n.", - "89,578", - "54,439", - "33,207", - "872", - "1,060", + "City Harrisville", + "389", + "171", + "16", + "15", + "18", + "35", + "49", + "31", + "", + "78", + "", + "80", + "", + "82", + "", + "81", + "", + "77", + "", + "", + "", + "73", + "", + "", + "", + "", ], [ - "Runaways . . . . . . . .\n. .\n. .\n. .\n. .\n. .\n. .\n.\n.\n. .\n. .\n.\n.\n.\n. .", - "73,616", - "48,343", - "19,670", - "1,653", - "3,950", + "Totals", + "9237", + "3396", + "371", + "373", + "317", + "1102", + "1221", + "835", + "0", + "1914", + "0", + "1934", + "", + "1967", + "", + "1963", + "0", + "1889", + "0", + "363", + "219", + "381", + "321", + "268", + "160", + "0", ], - ["1 Except forcible rape and prostitution.", "", "", "", "", ""], ] @@ -1573,6 +3254,9 @@ ["(each day of the payroll period)", ""], ] +# Network doesn't recognize the footer as belonging to the table. 
+data_network_table_regions = data_stream_table_areas[:-1] + data_stream_columns = [ [ "Clave", @@ -2060,6 +3744,14 @@ ] +# The stream algorithm excludes the string "Alphabetic Listing by type" +data_network_split_text = [] +data_network_split_text.extend(data_stream_split_text) +data_network_split_text[0] = [ + 'FEB', 'RUAR', 'Y 2014 M27 (BUS)', '', + 'ALPHABETIC LISTING BY T', 'YPE', '', '', '', 'ABLPDM27' +] + data_stream_flag_size = [ [ "States", @@ -2279,6 +3971,57 @@ ], ] + +# Network adds more content into the header. +data_network_flag_size = [ + ['', '', '', '', '(As at end-March)', '', '', '', '', '', ''], + ['', '', '', '', '', '', '', '', '', '', '(` Billion)'] +] +data_network_flag_size.extend(data_stream_flag_size) + +data_network_strip_text = [ + ["VinsauVerre", ""], + ["LesBlancs", "12.5CL"], + ["A.O.PCôtesduRhône", ""], + ["DomainedelaGuicharde«Autourdelachapelle»2016", "8€"], + ["A.O.PVacqueyras", ""], + ["DomainedeMontvac«Melodine»2016", "10€"], + ["A.O.PChâteauneufduPape", ""], + ["DomainedeBeaurenard2017", "13€"], + ["A.O.PCôteauxduLanguedoc", ""], + ["VillaTempora«Untempspourelle»2014", "9€"], + ["A.O.PCôtesdeProvence", ""], + ["ChâteauGrandBoise2017", "9€"], + ["LesRosés", "125CL"], + ["A.O.PCôtesduRhône", ""], + ["DomainedelaFlorane«AfleurdePampre»2016", "8€"], + ["FamilleCoulon(DomaineBeaurenard)Biotifulfox2017", "8€"], + ["A.O.PVacqueyras", ""], + ["DomainedeMontvac2017", "9€"], + ["A.O.PLanguedoc", ""], + ["DomainedeJoncas«Nébla»2015", "8€"], + ["VillaTempora«L’arroseurarrosé»2015", "9€"], + ["A.O.PCôtesdeProvence", ""], + ["ChâteauGrandBoise«SainteVictoire»2017", "9€"], + ["ChâteauLéoube2016", "10€"], + ["LesRouges", "12CL"], + ["A.O.PCôtesduRhône", ""], + ["DomainedeDionysos«LaCigalette»", "8€"], + ["ChâteauSaintEstèved’Uchaux«GrandeRéserve»2014", "9€"], + ["DomainedelaGuicharde«CuvéeMassillan»2016", "9€"], + ["DomainedelaFlorane«TerrePourpre»2014", "10€"], + ["L’OratoireStMartin«RéservedesSeigneurs»2015", "11€"], + ["A.O.PSaintJoseph", ""], + ["DomaineMonierPerréol«Châtelet»2015", "13€"], + ["A.O.PChâteauneufduPape", ""], + ["DomainedeBeaurenard2011", "15€"], + ["A.O.PCornas", ""], + ["DomaineLionnet«TerreBrûlée»2012", "15€"], +] + +# Stream only detects part of the table + + data_stream_strip_text = [ ["VinsauVerre", ""], ["LesBlancs", "12.5CL"], @@ -2342,6 +4085,12 @@ ["period.", ""], ] +# The stream algorithm ends up including a footer, which network correctly +# skips. 
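+# The footer spans the last three rows of the stream fixture, hence the
+# [:-3] slice below.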
+ +data_network_edge_tol = data_stream_edge_tol[:-3] + + data_lattice = [ [ "Cycle \nName", @@ -2593,6 +4342,52 @@ ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"], ] +data_hybrid_process_background = [ + [ + "State", + "Date", + "Halt", + "Halt", + "Persons", + "Persons", + "Persons", + "Persons", + ], + ["", "", "stations", "days", "directly", "trained", "counseled", "tested"], + ["", "", "", "", "reached", "", "", "for HIV"], + ["", "", "", "", "(in lakh)", "", "", ""], + ["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"], + ["Rajasthan", "2.12.2009 to", "", "", "", "", "", ""], + ["", "19.12.2009", "", "", "", "", "", ""], + ["Gujarat", "20.12.2009 to", "6", "13", "6.03", "3,810", "2,317", "1,453"], + ["", "3.1.2010", "", "", "", "", "", ""], + [ + "Maharashtra", + "4.01.2010 to", + "13", + "26", + "1.27", + "5,680", + "9,027", + "4,153", + ], + ["", "1.2.2010", "", "", "", "", "", ""], + [ + "Karnataka", + "2.2.2010 to", + "11", + "19", + "1.80", + "5,741", + "3,658", + "3,183", + ], + ["", "22.2.2010", "", "", "", "", "", ""], + ["Kerala", "23.2.2010 to", "9", "17", "1.42", "3,559", "2,173", "855"], + ["", "11.3.2010", "", "", "", "", "", ""], + ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"], +] + data_lattice_copy_text = [ ["Plan Type", "County", "Plan Name", "Totals"], ["GMC", "Sacramento", "Anthem Blue Cross", "164,380"], diff --git a/tests/files/baseline_plots/test_hybrid_contour_plot.png b/tests/files/baseline_plots/test_hybrid_contour_plot.png new file mode 100644 index 0000000..d781439 Binary files /dev/null and b/tests/files/baseline_plots/test_hybrid_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_grid_plot.png b/tests/files/baseline_plots/test_hybrid_grid_plot.png new file mode 100644 index 0000000..b04a2f1 Binary files /dev/null and b/tests/files/baseline_plots/test_hybrid_grid_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_textedge_plot.png b/tests/files/baseline_plots/test_hybrid_textedge_plot.png new file mode 100644 index 0000000..1c04473 Binary files /dev/null and b/tests/files/baseline_plots/test_hybrid_textedge_plot.png differ diff --git a/tests/files/baseline_plots/test_line_plot_ghostscript.png b/tests/files/baseline_plots/test_line_plot_ghostscript.png index 12c44c0..6ddeace 100644 Binary files a/tests/files/baseline_plots/test_line_plot_ghostscript.png and b/tests/files/baseline_plots/test_line_plot_ghostscript.png differ diff --git a/tests/files/baseline_plots/test_stream_contour_plot.png b/tests/files/baseline_plots/test_stream_contour_plot.png index 958ea0a..d781439 100644 Binary files a/tests/files/baseline_plots/test_stream_contour_plot.png and b/tests/files/baseline_plots/test_stream_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_grid_plot.png b/tests/files/baseline_plots/test_stream_grid_plot.png new file mode 100644 index 0000000..b04a2f1 Binary files /dev/null and b/tests/files/baseline_plots/test_stream_grid_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_textedge_plot.png b/tests/files/baseline_plots/test_stream_textedge_plot.png new file mode 100644 index 0000000..1c04473 Binary files /dev/null and b/tests/files/baseline_plots/test_stream_textedge_plot.png differ diff --git a/tests/files/baseline_plots/test_text_plot.png b/tests/files/baseline_plots/test_text_plot.png index 63b5520..497af37 100644 Binary files a/tests/files/baseline_plots/test_text_plot.png and 
b/tests/files/baseline_plots/test_text_plot.png differ
diff --git a/tests/files/baseline_plots/test_textedge_plot.png b/tests/files/baseline_plots/test_textedge_plot.png
index 1de4e9c..1c04473 100644
Binary files a/tests/files/baseline_plots/test_textedge_plot.png and b/tests/files/baseline_plots/test_textedge_plot.png differ
diff --git a/tests/files/multiple_tables.pdf b/tests/files/multiple_tables.pdf
new file mode 100644
index 0000000..0b9d854
Binary files /dev/null and b/tests/files/multiple_tables.pdf differ
diff --git a/tests/files/vertical_header.pdf b/tests/files/vertical_header.pdf
new file mode 100644
index 0000000..a343305
Binary files /dev/null and b/tests/files/vertical_header.pdf differ
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 2357eae..1cb05bb 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -98,6 +98,44 @@ def test_cli_parallel(testdir):
     assert result.output == "Found 2 tables\n"
 
 
+def test_cli_hybrid(testdir):
+    with TemporaryDirectory() as tempdir:
+        infile = os.path.join(testdir, "budget.pdf")
+        outfile = os.path.join(tempdir, "budget.csv")
+        runner = CliRunner()
+        result = runner.invoke(
+            cli, ["--format", "csv", "--output", outfile, "hybrid", infile]
+        )
+        assert result.exit_code == 0
+        assert result.output == "Found 1 tables\n"
+
+        result = runner.invoke(cli, ["--format", "csv", "hybrid", infile])
+        output_error = "Error: Please specify output file path using --output"
+        assert output_error in result.output
+
+        result = runner.invoke(cli, ["--output", outfile, "hybrid", infile])
+        format_error = "Please specify output file format using --format"
+        assert format_error in result.output
+
+
+def test_cli_network(testdir):
+    with TemporaryDirectory() as tempdir:
+        infile = os.path.join(testdir, "budget.pdf")
+        outfile = os.path.join(tempdir, "budget.csv")
+        runner = CliRunner()
+        result = runner.invoke(
+            cli, ["--format", "csv", "--output", outfile, "network", infile]
+        )
+        assert result.exit_code == 0
+        assert result.output == "Found 1 tables\n"
+
+        result = runner.invoke(cli, ["--format", "csv", "network", infile])
+        output_error = "Error: Please specify output file path using --output"
+        assert output_error in result.output
+
+        result = runner.invoke(cli, ["--output", outfile, "network", infile])
+        format_error = "Please specify output file format using --format"
+        assert format_error in result.output
+
+
 def test_cli_password(testdir):
     with TemporaryDirectory() as tempdir:
         infile = os.path.join(testdir, "health_protected.pdf")
@@ -216,6 +254,7 @@ def test_cli_quiet(testdir):
 
 def test_cli_lattice_plot_type():
     with TemporaryDirectory() as tempdir:
+        outfile = os.path.join(tempdir, "lattice_contour.png")
        runner = CliRunner()
-        result = runner.invoke(cli, ["--plot_type", ""])
+        result = runner.invoke(cli, ["--plot_type", "contour", "--output", outfile, "--format", "png"])
        assert result.exit_code != 0, f"Output: {result.output}"
diff --git a/tests/test_common.py b/tests/test_common.py
index c324317..8ac196b 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -49,7 +49,6 @@ def test_password(testdir):
     tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
     assert_frame_equal(df, tables[0].df)
 
-
 @skip_pdftopng
 def test_repr_poppler(testdir):
     filename = os.path.join(testdir, "foo.pdf")
diff --git a/tests/test_errors.py b/tests/test_errors.py
index 41262c1..06b7d64 100644
--- a/tests/test_errors.py
+++ b/tests/test_errors.py
@@ -9,7 +9,7 @@
 def test_unknown_flavor(foo_pdf):
-    message = "Unknown flavor specified." " Use either 'lattice' or 'stream'"
+    message = "Unknown flavor specified." " Use either 'lattice', 'stream', 'network' or 'hybrid'"
     with pytest.raises(NotImplementedError, match=message):
         camelot.read_pdf(foo_pdf, flavor="chocolate")
 
@@ -104,7 +104,7 @@ def test_stream_no_tables_in_area(testdir):
         warnings.simplefilter("error")
         with pytest.raises(UserWarning) as e:
             tables = camelot.read_pdf(filename, flavor="stream")
-    assert str(e.value) == "No tables found in table area 1"
+    assert str(e.value) == "No tables found in table area (0, 0, 792, 612)"
 
 
 def test_lattice_no_tables_on_page(testdir):
diff --git a/tests/test_hybrid.py b/tests/test_hybrid.py
new file mode 100644
index 0000000..980f9c5
--- /dev/null
+++ b/tests/test_hybrid.py
@@ -0,0 +1,136 @@
+import os
+
+import pandas as pd
+from pandas.testing import assert_frame_equal
+
+import camelot
+
+from .data import *
+
+
+def test_hybrid(testdir):
+    df = pd.DataFrame(data_hybrid)
+
+    filename = os.path.join(testdir, "health.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_table_rotated(testdir):
+    df = pd.DataFrame(data_hybrid_table_rotated)
+
+    filename = os.path.join(testdir, "clockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert_frame_equal(df, tables[0].df)
+
+    filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_two_tables(testdir):
+    df1 = pd.DataFrame(data_network_two_tables_1)
+    df2 = pd.DataFrame(data_network_two_tables_2)
+
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+
+def test_hybrid_vertical_header(testdir):
+    """Test a complex table with a vertical text header.
+    """
+    df = pd.DataFrame(data_hybrid_vertical_headers)
+
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_process_background(testdir):
+    df = pd.DataFrame(data_hybrid_process_background)
+
+    filename = os.path.join(testdir, "background_lines_1.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", process_background=True)
+    assert_frame_equal(df, tables[1].df)
+
+
+def test_hybrid_table_regions(testdir):
+    df = pd.DataFrame(data_network_table_regions)
+
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", table_regions=["320,335,573,505"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_table_areas(testdir):
+    df = pd.DataFrame(data_stream_table_areas)
+
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", table_areas=["320,500,573,335"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_columns(testdir):
+    df = pd.DataFrame(data_stream_columns)
+
+    filename = os.path.join(testdir, "mexican_towns.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", columns=["67,180,230,425,475"], row_tol=10
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_split_text(testdir):
+    df = pd.DataFrame(data_network_split_text)
+
+    filename = os.path.join(testdir, "tabula/m27.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        flavor="hybrid",
+        columns=["72,95,209,327,442,529,566,606,683"],
+        split_text=True,
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_flag_size(testdir):
+    df = pd.DataFrame(data_network_flag_size)
+
+    filename = os.path.join(testdir, "superscript.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid", flag_size=True)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_strip_text(testdir):
+    df = pd.DataFrame(data_network_strip_text)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid", strip_text=" ,\n")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_edge_tol(testdir):
+    df = pd.DataFrame(data_network_edge_tol)
+
+    filename = os.path.join(testdir, "edge_tol.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid", edge_tol=500)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_layout_kwargs(testdir):
+    df = pd.DataFrame(data_stream_layout_kwargs)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", layout_kwargs={"detect_vertical": False}
+    )
+    assert_frame_equal(df, tables[0].df)
diff --git a/tests/test_network.py b/tests/test_network.py
new file mode 100644
index 0000000..609001d
--- /dev/null
+++ b/tests/test_network.py
@@ -0,0 +1,143 @@
+import os
+
+import pandas as pd
+from pandas.testing import assert_frame_equal
+
+import camelot
+
+from .data import *
+
+
+def test_network(testdir):
+    df = pd.DataFrame(data_stream)
+
+    filename = os.path.join(testdir, "health.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_rotated(testdir):
+    df = pd.DataFrame(data_network_table_rotated)
+
+    filename = os.path.join(testdir, "clockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+    filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_two_tables_a(testdir):
+    df1 = pd.DataFrame(data_network_two_tables_1)
+    df2 = pd.DataFrame(data_network_two_tables_2)
+
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+
+# Reported as https://github.com/camelot-dev/camelot/issues/132
+def test_network_two_tables_b(testdir):
+    df1 = pd.DataFrame(data_network_two_tables_b_1)
+    df2 = pd.DataFrame(data_network_two_tables_b_2)
+    filename = os.path.join(testdir, "multiple_tables.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+
+def test_network_vertical_header(testdir):
+    """Test a complex table with a vertical text header.
+    """
+    df = pd.DataFrame(data_network_vertical_headers)
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_regions(testdir):
+    df = pd.DataFrame(data_network_table_regions)
+
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    # The "stream" test looks for a region in ["320,460,573,335"], which
+    # should exclude the header.
+    tables = camelot.read_pdf(
+        filename, flavor="network", table_regions=["320,335,573,505"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_areas(testdir):
+    df = pd.DataFrame(data_stream_table_areas)
+
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", table_areas=["320,500,573,335"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_columns(testdir):
+    df = pd.DataFrame(data_stream_columns)
+
+    filename = os.path.join(testdir, "mexican_towns.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", columns=["67,180,230,425,475"], row_tol=10
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_split_text(testdir):
+    df = pd.DataFrame(data_network_split_text)
+
+    filename = os.path.join(testdir, "tabula/m27.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        flavor="network",
+        columns=["72,95,209,327,442,529,566,606,683"],
+        split_text=True,
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_flag_size(testdir):
+    df = pd.DataFrame(data_network_flag_size)
+
+    filename = os.path.join(testdir, "superscript.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", flag_size=True)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_strip_text(testdir):
+    df = pd.DataFrame(data_network_strip_text)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", strip_text=" ,\n")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_edge_tol(testdir):
+    df = pd.DataFrame(data_network_edge_tol)
+
+    filename = os.path.join(testdir, "edge_tol.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", edge_tol=500)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_layout_kwargs(testdir):
+    df = pd.DataFrame(data_stream_layout_kwargs)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", layout_kwargs={"detect_vertical": False}
+    )
+    assert_frame_equal(df, tables[0].df)
diff --git a/tests/test_plotting.py b/tests/test_plotting.py
index 17dc9ad..d44d072 100644
--- a/tests/test_plotting.py
+++ b/tests/test_plotting.py
@@ -14,7 +14,7 @@ def test_text_plot(testdir):
     tables = camelot.read_pdf(filename)
     return camelot.plot(tables[0], kind="text")
 
-
+@skip_on_windows
 @pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
 def test_textedge_plot(testdir):
     filename = os.path.join(testdir, "tabula/12s0324.pdf")
@@ -37,13 +37,70 @@ def test_lattice_contour_plot_ghostscript(testdir):
     tables = camelot.read_pdf(filename, backend="ghostscript")
     return camelot.plot(tables[0], kind="contour")
 
-
+@skip_on_windows
 @pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
 def test_stream_contour_plot(testdir):
     filename = os.path.join(testdir, "tabula/12s0324.pdf")
     tables = camelot.read_pdf(filename, flavor="stream")
     return camelot.plot(tables[0], kind="contour")
 
+@skip_on_windows
+@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+def test_stream_grid_plot(testdir):
+    filename = os.path.join(testdir, "foo.pdf")
+    tables = camelot.read_pdf(filename, flavor="stream")
+    return camelot.plot(tables[0], kind="grid")
+
+@skip_on_windows
+@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+def test_network_grid_plot(testdir):
+    filename = os.path.join(testdir, "foo.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    return camelot.plot(tables[0], kind="grid")
+
+@skip_on_windows
+@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+def test_network_contour_plot(testdir):
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    return camelot.plot(tables[0], kind="contour")
+
+@skip_on_windows
+@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+def test_network_textedge_plot(testdir):
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, debug=True, flavor="network")
+    return camelot.plot(tables[0], kind="textedge")
+
+
+@skip_on_windows
+@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+def test_network_table_regions_textedge_plot(testdir):
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, debug=True, flavor="network", table_regions=["320,505,573,330"]
+    )
+    return camelot.plot(tables[0], kind="textedge")
+
+
+@skip_on_windows
+@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+def test_network_table_areas_text_plot(testdir):
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, debug=True, flavor="network", table_areas=["320,500,573,335"]
+    )
+    return camelot.plot(tables[0], kind="text")
+
+@skip_on_windows
+@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+def test_network_table_search_plot(testdir):
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, debug=True, flavor="network"
+    )
+    return camelot.plot(tables[0], kind="network_table_search")
+
 @skip_pdftopng
 @pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)