Skip to content
This repository was archived by the owner on Apr 2, 2025. It is now read-only.
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
64765fd
Refactor base classes and improve plotting
Apr 19, 2020
9d86980
Fixup Pre-commit
bosd Sep 21, 2024
9353dce
Fixup Pre-commit utils.py
bosd Sep 21, 2024
2e1e45b
Disable plot tests on windows
bosd Sep 21, 2024
7bd4c9e
[IMP] Transparency on contourplot
bosd Sep 21, 2024
f97c731
Initial Hybrid parser, for now identical to Stream
Apr 19, 2020
543aae1
Refactor hybrid parser tests to own file
bosd Sep 21, 2024
b4e3aec
'Merge' parsers from 'hybrid-parser' branch
bosd Sep 21, 2024
6f2fd7d
Add network parser to cli (#38)
bosd Sep 21, 2024
0c8b0bc
Add debug mode and support for new parsers
bosd Sep 21, 2024
3ef91c4
Handlers add support for network and hybrid parsers, todo: debug support
bosd Sep 21, 2024
3764b74
Further changes to support new parsers
bosd Sep 22, 2024
3bd72b2
Refactor core.py to support textedges and alignments
bosd Sep 22, 2024
c800863
add generic function get_index_closest_point to utils
bosd Sep 22, 2024
a72c350
Update utls.py
bosd Sep 22, 2024
c9537ad
Refactor/move copy_spanning_text to core
bosd Sep 22, 2024
b1e21b5
Finalize the base class
bosd Sep 22, 2024
fadeda7
Refactor to use prepare_page_parse
bosd Sep 22, 2024
ed520bf
Lattice Fix rename imagename to pdf_image
bosd Sep 22, 2024
61ff62d
Further refactor lattice parser
bosd Sep 22, 2024
92838f4
Fixup base parser/ improvement from fork
bosd Sep 22, 2024
7386989
Refactor Stream Parser
bosd Sep 22, 2024
10a6940
Plotting.py support for axes
bosd Sep 23, 2024
4856cd2
Update 'test_unknown_flavor' to support new parsers
bosd Sep 23, 2024
44b8d11
data.py - attempt fixup vertical_header test, did not work
bosd Sep 23, 2024
cd35706
add test_cli network parser
bosd Sep 25, 2024
b2af275
multiple table support
bosd Sep 28, 2024
a720762
Fixup network_table_search
bosd Sep 28, 2024
2e073fd
leftovers in core.py should fix network parser
bosd Sep 28, 2024
915bd2b
imp network debug plot
bosd Sep 28, 2024
bd6ef52
imp network table search green line
bosd Sep 28, 2024
228f7b3
intermediate tests hybrid
bosd Sep 29, 2024
9c8ee92
IMP utils to support kwargs
bosd Sep 29, 2024
99207a1
Fixup Hybrid table rotated test
bosd Sep 29, 2024
1daa56e
Unfinished attempt to fix cli plot lattice
bosd Sep 29, 2024
91ccf38
[IMP] Plotting tests
bosd Sep 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 195 additions & 4 deletions camelot/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,13 +290,204 @@ def stream(c, *args, **kwargs):
columns = list(kwargs["columns"])
kwargs["columns"] = None if not columns else columns

margins = conf.pop('margins')
margins = conf.pop("margins")

if margins is None:
layout_kwargs = {}
else:
layout_kwargs = {"char_margin": margins[0], "line_margin": margins[1], "word_margin": margins[2]}

layout_kwargs = {
"char_margin": margins[0],
"line_margin": margins[1],
"word_margin": margins[2],
}

if plot_type is not None:
if not _HAS_MPL:
raise ImportError("matplotlib is required for plotting.")
else:
if output is None:
raise click.UsageError("Please specify output file path using --output")
if f is None:
raise click.UsageError("Please specify output file format using --format")

tables = read_pdf(
filepath,
pages=pages,
flavor="stream",
suppress_stdout=quiet,
layout_kwargs=layout_kwargs,
**kwargs,
)
click.echo(f"Found {tables.n} tables")
if plot_type is not None:
for table in tables:
plot(table, kind=plot_type)
plt.show()
else:
tables.export(output, f=f, compress=compress)


@cli.command("hybrid")
@click.option(
    "-R",
    "--table_regions",
    default=[],
    multiple=True,
    help=(
        "Page regions to analyze. Example: x1,y1,x2,y2"
        " where x1, y1 -> left-top and x2, y2 -> right-bottom."
    ),
)
@click.option(
    "-T",
    "--table_areas",
    default=[],
    multiple=True,
    help=(
        "Table areas to process. Example: x1,y1,x2,y2"
        " where x1, y1 -> left-top and x2, y2 -> right-bottom."
    ),
)
@click.option(
    "-C",
    "--columns",
    default=[],
    multiple=True,
    help="X coordinates of column separators.",
)
@click.option(
    "-e",
    "--edge_tol",
    default=50,
    help="Tolerance parameter for extending textedges vertically.",
)
@click.option(
    "-r",
    "--row_tol",
    default=2,
    help="Tolerance parameter used to combine text vertically, to generate rows.",
)
@click.option(
    "-c",
    "--column_tol",
    default=0,
    help=(
        "Tolerance parameter"
        " used to combine text horizontally, to generate columns."
    ),
)
@click.option(
    "-plot",
    "--plot_type",
    type=click.Choice(["text", "grid", "contour", "textedge"]),
    help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def hybrid(c, *args, **kwargs):
    """Use spaces between text to parse the table."""
    # The docstring above doubles as the command's --help text, so it is kept
    # verbatim.
    # NOTE(review): it is identical to the "network" command's help text;
    # confirm it really describes the hybrid flavor.

    # Pull the settings this command consumes itself out of the shared config;
    # whatever is left in the config is forwarded to read_pdf further down.
    conf = c.config
    pages, output, f, compress, quiet = [
        conf.pop(key) for key in ("pages", "output", "format", "zip", "quiet")
    ]
    plot_type = kwargs.pop("plot_type")
    filepath = kwargs.pop("filepath")

    # NOTE(review): the stream command pops "margins" from the config and
    # passes it on as layout_kwargs; here every remaining config entry goes
    # straight into kwargs. Confirm that read_pdf accepts those entries.
    kwargs.update(conf)

    # The repeatable options arrive as (possibly empty) sequences; an empty
    # one is passed on as None instead of an empty list.
    for option in ("table_regions", "table_areas", "columns"):
        kwargs[option] = list(kwargs[option]) or None

    plotting = plot_type is not None
    if plotting:
        if not _HAS_MPL:
            raise ImportError("matplotlib is required for plotting.")
    elif output is None:
        # Exporting needs a destination path ...
        raise click.UsageError("Please specify output file path using --output")
    elif f is None:
        # ... and an output format.
        raise click.UsageError("Please specify output file format using --format")

    tables = read_pdf(
        filepath,
        pages=pages,
        flavor="hybrid",
        suppress_stdout=quiet,
        **kwargs,
    )
    click.echo(f"Found {tables.n} tables")

    if not plotting:
        tables.export(output, f=f, compress=compress)
        return

    # Debug mode: draw each table and show its figure before the next one.
    for table in tables:
        plot(table, kind=plot_type)
        plt.show()


@cli.command("network")
@click.option(
"-R",
"--table_regions",
default=[],
multiple=True,
help="Page regions to analyze. Example: x1,y1,x2,y2"
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
"-T",
"--table_areas",
default=[],
multiple=True,
help="Table areas to process. Example: x1,y1,x2,y2"
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
"-C",
"--columns",
default=[],
multiple=True,
help="X coordinates of column separators.",
)
@click.option(
"-e",
"--edge_tol",
default=50,
help="Tolerance parameter" " for extending textedges vertically.",
)
@click.option(
"-r",
"--row_tol",
default=2,
help="Tolerance parameter" " used to combine text vertically, to generate rows.",
)
@click.option(
"-c",
"--column_tol",
default=0,
help="Tolerance parameter"
" used to combine text horizontally, to generate columns.",
)
@click.option(
"-plot",
"--plot_type",
type=click.Choice(["text", "grid", "contour", "textedge"]),
help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def network(c, *args, **kwargs):
"""Use spaces between text to parse the table."""
conf = c.config
pages = conf.pop("pages")
output = conf.pop("output")
f = conf.pop("format")
compress = conf.pop("zip")
quiet = conf.pop("quiet")
plot_type = kwargs.pop("plot_type")
filepath = kwargs.pop("filepath")
kwargs.update(conf)

table_regions = list(kwargs["table_regions"])
kwargs["table_regions"] = None if not table_regions else table_regions
table_areas = list(kwargs["table_areas"])
kwargs["table_areas"] = None if not table_areas else table_areas
columns = list(kwargs["columns"])
kwargs["columns"] = None if not columns else columns

if plot_type is not None:
if not _HAS_MPL:
raise ImportError("matplotlib is required for plotting.")
Expand All @@ -307,7 +498,7 @@ def stream(c, *args, **kwargs):
raise click.UsageError("Please specify output file format using --format")

tables = read_pdf(
filepath, pages=pages, flavor="stream", suppress_stdout=quiet, layout_kwargs=layout_kwargs, **kwargs
filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs
)
click.echo(f"Found {tables.n} tables")
if plot_type is not None:
Expand Down
Loading