Skip to content
This repository was archived by the owner on Apr 2, 2025. It is now read-only.
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
fe3660e
Refactor base classes and improve plotting
Apr 19, 2020
7e7c907
Fixup Pre-commit
bosd Sep 21, 2024
d5c29c0
Disable plot tests on windows
bosd Sep 21, 2024
05026cd
[IMP] Transparency on contourplot
bosd Sep 21, 2024
c64ca3f
Initial Hybrid parser, for now identical to Stream
Apr 19, 2020
830aa24
Refactor hybrid parser tests to own file
bosd Sep 21, 2024
5e56428
'Merge' parsers from 'hybrid-parser' branch
bosd Sep 21, 2024
c0bc6b4
Add network parser to cli (#38)
bosd Sep 21, 2024
71f2e72
Add debug mode and support for neew parsers
bosd Sep 21, 2024
fed41ba
Handlers add support for network and hybrid parsers, todo: debug support
bosd Sep 21, 2024
1ed8d05
Further changes to support new parsers
bosd Sep 22, 2024
5432d2d
Refactor core.py to support textedges and alignments
bosd Sep 22, 2024
f475874
add generic function get_index_closest_point to utils
bosd Sep 22, 2024
5458126
Update utils.py
bosd Sep 22, 2024
3964f80
Refactor/move copy_spanning_text to core
bosd Sep 22, 2024
66e7413
Finalize the base class
bosd Sep 22, 2024
0817307
Refactor to use prepare_page_parse
bosd Sep 22, 2024
54c8242
Lattice Fix rename imagename to pdf_image
bosd Sep 22, 2024
e8f2e7a
Further refactor lattice parser
bosd Sep 22, 2024
74567e3
Fixup base parser/ improvement from fork
bosd Sep 22, 2024
18da554
Refactor Stream Parser
bosd Sep 22, 2024
605fd40
Plotting.py support for axes
bosd Sep 23, 2024
609ebb0
Update 'test_unknown_flavor' to support new parsers
bosd Sep 23, 2024
08c5edd
add test_cli network parser
bosd Sep 25, 2024
f2595c8
[IMP] Multiple table support
bosd Sep 28, 2024
bd3c82d
Fixup network_table_search
bosd Sep 28, 2024
dc234db
leftovers in core.py should fix network parser
bosd Sep 28, 2024
a982b4c
imp network debug plot
bosd Sep 28, 2024
ac093d4
imp network table search green line
bosd Sep 28, 2024
71f4c69
IMP utils to support kwargs
bosd Sep 29, 2024
b6bfa11
[IMP] Hybrid tests
bosd Sep 23, 2024
15782bb
Fix test cli plot lattice
bosd Sep 29, 2024
0447d9a
[IMP] Plotting tests
bosd Sep 29, 2024
c2b255f
Sort the PDFMiner text objects along the x axis before applying the g…
bosd Oct 6, 2024
4aa2b10
Pre-commit Fixes
bosd Oct 6, 2024
27536a7
[IMP] Docstrings of cli
bosd Oct 6, 2024
c71e44e
[IMP] Docstrings of handlers
bosd Oct 6, 2024
24c0ebd
[IMP] Docstrings of hybridparser
bosd Oct 6, 2024
87b127f
[IMP] Docstrings of latticeparser
bosd Oct 6, 2024
75bf5ee
[IMP] Docstrings of networkparser
bosd Oct 6, 2024
552a1b5
Update Flake8, exclude docstring qa on tests folder
bosd Oct 6, 2024
62345c9
Update Flake8 config
bosd Oct 7, 2024
bd7b7d7
Add/Improve docstrings on backends
bosd Oct 7, 2024
44a7f90
Update noxfile qa
bosd Oct 7, 2024
fbab152
Update Docstrings on Plotting
bosd Oct 7, 2024
56ccc20
Update Docstrings on core.py
bosd Oct 7, 2024
7db4789
Update Docstrings on io.py
bosd Oct 7, 2024
3817239
Update Docstrings
bosd Oct 7, 2024
fa2ae3f
Fixup utils.py
bosd Oct 7, 2024
2ada26d
Reflect new functions and Hybrid and Network parsers in packing in py…
bosd Oct 7, 2024
96348f7
Fixup stream unused imports
bosd Oct 7, 2024
0725937
Flake8 Fixes on lattice
bosd Oct 7, 2024
fea5970
Fix B907
bosd Oct 7, 2024
b4ceb19
Improve Plotting, tests, readability, usability and cleanup
bosd Oct 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .flake8
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
[flake8]
select = B,B9,C,D,DAR,E,F,N,RST,S,W
ignore = E203,E501,RST201,RST203,RST301,W503
ignore = D401,E203,E501,RST201,RST203,RST301,RST305,W503
max-line-length = 120
max-complexity = 10
docstring-convention = numpy
per-file-ignores = tests/*:S101,S106,F403,F405,F841 pypdf_table_extraction/*:D100,D103,D104,F401,W0611,C0114
per-file-ignores = tests/*:B950,D100,D102,D103,D104,D401,D101,D200,S101,S106,F403,F405,F841
pypdf_table_extraction/*:D100,D103,D104,F401,W0611,C0114 docs/*:D100,D101
__init__.py:D100,D103,D104,F401
__version__.py:D100
__main__.py:D100,D103
rst-roles = class,const,func,meth,mod,ref
rst-directives = deprecated
2 changes: 2 additions & 0 deletions camelot/__main__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Initialize pypdf_table_extraction, formerly known as Camelot."""

__all__ = ("main",)


Expand Down
2 changes: 2 additions & 0 deletions camelot/backends/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
"""pypdf_table_extraction offers multiple backends to convert PDFs to images so they can be analyzed by OpenCV."""

from .image_conversion import ImageConversionBackend
20 changes: 20 additions & 0 deletions camelot/backends/ghostscript_backend.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,26 @@
"""Creates a ghostscript backend class to convert a pdf to a png file."""


class GhostscriptBackend:
"""Backend that converts a PDF to a PNG image using Ghostscript."""

def convert(self, pdf_path, png_path, resolution=300):
"""Convert a PDF to a PNG image using Ghostscript.

Parameters
----------
pdf_path : str
[description]
png_path : str
[description]
resolution : int, optional
[description], by default 300

Raises
------
OSError
[description]
"""
try:
import ghostscript
except RuntimeError:
Expand Down
40 changes: 37 additions & 3 deletions camelot/backends/image_conversion.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Classes and functions for the image conversion backends."""

from .ghostscript_backend import GhostscriptBackend
from .poppler_backend import PopplerBackend

Expand All @@ -6,15 +8,47 @@


class ImageConversionBackend:
"""Image conversion backend that can fall back to the other available backends."""

def __init__(self, backend="poppler", use_fallback=True):
"""Initialize the conversion backend.

Parameters
----------
backend : str, optional
[description], by default "poppler"
use_fallback : bool, optional
[description], by default True

Raises
------
ValueError
[description]
"""
if backend not in BACKENDS.keys():
raise ValueError(f"Image conversion backend '{backend}' not supported")
raise ValueError(f"Image conversion backend {backend!r} not supported")

self.backend = backend
self.use_fallback = use_fallback
self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys()))

def convert(self, pdf_path, png_path):
"""Convert PDF to png_path.

Parameters
----------
pdf_path : str
Path where to read the pdf file.
png_path : str
Path where to save png file.

Raises
------
type
[description]
type
[description]
"""
try:
converter = BACKENDS[self.backend]()
converter.convert(pdf_path, png_path)
Expand All @@ -28,12 +62,12 @@ def convert(self, pdf_path, png_path):
converter.convert(pdf_path, png_path)
except Exception as e:
raise type(e)(
str(e) + f" with image conversion backend '{fallback}'"
str(e) + f" with image conversion backend {fallback!r}"
).with_traceback(sys.exc_info()[2])
continue
else:
break
else:
raise type(e)(
str(e) + f" with image conversion backend '{self.backend}'"
str(e) + f" with image conversion backend {self.backend!r}"
).with_traceback(sys.exc_info()[2])
28 changes: 28 additions & 0 deletions camelot/backends/poppler_backend.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
"""Creates a poppler backend class to convert a pdf to a png file.

Raises
------
OSError
[description]
ValueError
[description]
"""

import os
import shutil
import subprocess
Expand All @@ -8,7 +18,25 @@


class PopplerBackend:
"""Backend that converts a PDF to a PNG image using Poppler's pdftopng."""

def convert(self, pdf_path, png_path):
"""Convert PDF to png.

Parameters
----------
pdf_path : str
Path where to read the pdf file.
png_path : str
Path where to save png file.

Raises
------
OSError
[description]
ValueError
[description]
"""
pdftopng_executable = shutil.which("pdftopng", path=path)
if pdftopng_executable is None:
raise OSError(
Expand Down
190 changes: 189 additions & 1 deletion camelot/cli.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Implementation of the command line interface."""

import logging

import click
Expand All @@ -20,10 +22,14 @@


class Config:
    """Hold option values as a plain key/value mapping.

    The values live in the ``config`` dictionary, which starts out empty and
    is filled one entry at a time through :meth:`set_config`.
    """

    def __init__(self):
        """Create the container with an empty ``config`` dictionary."""
        self.config = {}

    def set_config(self, key, value):
        """Store ``value`` under ``key``, replacing any earlier entry.

        Parameters
        ----------
        key : hashable
            Name of the option.
        value : object
            Value to remember for that option.
        """
        self.config.update({key: value})


Expand Down Expand Up @@ -81,7 +87,7 @@ def set_config(self, key, value):
)
@click.pass_context
def cli(ctx, *args, **kwargs):
"""pypdf_table_extraction: PDF Table Extraction for Humans"""
"""pypdf_table_extraction: PDF Table Extraction for Humans."""
ctx.obj = Config()
for key, value in kwargs.items():
ctx.obj.set_config(key, value)
Expand Down Expand Up @@ -325,3 +331,185 @@ def stream(c, *args, **kwargs):
plt.show()
else:
tables.export(output, f=f, compress=compress)


@cli.command("hybrid")
@click.option(
    "-R",
    "--table_regions",
    default=[],
    multiple=True,
    help="Page regions to analyze. Example: x1,y1,x2,y2"
    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-T",
    "--table_areas",
    default=[],
    multiple=True,
    help="Table areas to process. Example: x1,y1,x2,y2"
    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-C",
    "--columns",
    default=[],
    multiple=True,
    help="X coordinates of column separators.",
)
@click.option(
    "-e",
    "--edge_tol",
    default=50,
    help="Tolerance parameter for extending textedges vertically.",
)
@click.option(
    "-r",
    "--row_tol",
    default=2,
    help="Tolerance parameter used to combine text vertically, to generate rows.",
)
@click.option(
    "-c",
    "--column_tol",
    default=0,
    help="Tolerance parameter"
    " used to combine text horizontally, to generate columns.",
)
@click.option(
    "-plot",
    "--plot_type",
    type=click.Choice(["text", "grid", "contour", "textedge"]),
    help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def hybrid(c, *args, **kwargs):
    """Use spaces between text to parse the table."""
    # NOTE: click shows the docstring above as this command's help text, so
    # it is kept verbatim; implementation notes live in comments instead.
    #
    # Flow: pull the shared options out of ``c.config``, normalise the
    # repeatable options, validate the plot/export prerequisites, run
    # ``read_pdf`` with flavor="hybrid", then either plot or export.
    settings = c.config

    # These five entries are consumed here; whatever remains in ``settings``
    # is forwarded to ``read_pdf`` through ``kwargs`` below.
    pages = settings.pop("pages")
    output = settings.pop("output")
    fmt = settings.pop("format")
    compress = settings.pop("zip")
    quiet = settings.pop("quiet")

    plot_type = kwargs.pop("plot_type")
    filepath = kwargs.pop("filepath")
    kwargs.update(settings)

    # Repeatable options arrive as sequences; an empty one is passed on as
    # ``None`` instead of an empty list.
    for option in ("table_regions", "table_areas", "columns"):
        kwargs[option] = list(kwargs[option]) or None

    plotting = plot_type is not None
    if plotting:
        if not _HAS_MPL:
            raise ImportError("matplotlib is required for plotting.")
    elif output is None:
        raise click.UsageError("Please specify output file path using --output")
    elif fmt is None:
        raise click.UsageError("Please specify output file format using --format")

    tables = read_pdf(
        filepath,
        pages=pages,
        flavor="hybrid",
        suppress_stdout=quiet,
        **kwargs,
    )
    click.echo(f"Found {tables.n} tables")

    if not plotting:
        tables.export(output, f=fmt, compress=compress)
        return

    for table in tables:
        plot(table, kind=plot_type)
        plt.show()


@cli.command("network")
@click.option(
    "-R",
    "--table_regions",
    default=[],
    multiple=True,
    help="Page regions to analyze. Example: x1,y1,x2,y2"
    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-T",
    "--table_areas",
    default=[],
    multiple=True,
    help="Table areas to process. Example: x1,y1,x2,y2"
    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-C",
    "--columns",
    default=[],
    multiple=True,
    help="X coordinates of column separators.",
)
@click.option(
    "-e",
    "--edge_tol",
    default=50,
    help="Tolerance parameter for extending textedges vertically.",
)
@click.option(
    "-r",
    "--row_tol",
    default=2,
    help="Tolerance parameter used to combine text vertically, to generate rows.",
)
@click.option(
    "-c",
    "--column_tol",
    default=0,
    help="Tolerance parameter"
    " used to combine text horizontally, to generate columns.",
)
@click.option(
    "-plot",
    "--plot_type",
    type=click.Choice(["text", "grid", "contour", "textedge"]),
    help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def network(c, *args, **kwargs):
    """Use spaces between text to parse the table."""
    # NOTE: click shows the docstring above as this command's help text, so
    # it is kept verbatim; implementation notes live in comments instead.
    #
    # Flow: pull the shared options out of ``c.config``, normalise the
    # repeatable options, validate the plot/export prerequisites, run
    # ``read_pdf`` with flavor="network", then either plot or export.
    settings = c.config

    # These five entries are consumed here; whatever remains in ``settings``
    # is forwarded to ``read_pdf`` through ``kwargs`` below.
    pages = settings.pop("pages")
    output = settings.pop("output")
    fmt = settings.pop("format")
    compress = settings.pop("zip")
    quiet = settings.pop("quiet")

    plot_type = kwargs.pop("plot_type")
    filepath = kwargs.pop("filepath")
    kwargs.update(settings)

    # Repeatable options arrive as sequences; an empty one is passed on as
    # ``None`` instead of an empty list.
    for option in ("table_regions", "table_areas", "columns"):
        kwargs[option] = list(kwargs[option]) or None

    plotting = plot_type is not None
    if plotting:
        if not _HAS_MPL:
            raise ImportError("matplotlib is required for plotting.")
    elif output is None:
        raise click.UsageError("Please specify output file path using --output")
    elif fmt is None:
        raise click.UsageError("Please specify output file format using --format")

    tables = read_pdf(
        filepath,
        pages=pages,
        flavor="network",
        suppress_stdout=quiet,
        **kwargs,
    )
    click.echo(f"Found {tables.n} tables")

    if not plotting:
        tables.export(output, f=fmt, compress=compress)
        return

    for table in tables:
        plot(table, kind=plot_type)
        plt.show()
Loading