diff --git a/camelot/cli.py b/camelot/cli.py index 79bbe354..18e9a1aa 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -52,6 +52,15 @@ def set_config(self, key, value): @click.group(name="camelot") @click.version_option(version=__version__) +@click.pass_context +def cli(ctx, *args, **kwargs): + """Camelot: PDF Table Extraction for Humans.""" + ctx.obj = Config() + for key, value in kwargs.items(): + ctx.obj.set_config(key, value) + + +@cli.command("lattice") @click.option( "-q", "--quiet", is_flag=False, default=False, help="Suppress logs and warnings." ) @@ -101,15 +110,6 @@ def set_config(self, key, value): default=(1.0, 0.5, 0.1), help="PDFMiner char_margin, line_margin and word_margin.", ) -@click.pass_context -def cli(ctx, *args, **kwargs): - """Camelot: PDF Table Extraction for Humans.""" - ctx.obj = Config() - for key, value in kwargs.items(): - ctx.obj.set_config(key, value) - - -@cli.command("lattice") @click.option( "-R", "--table_regions", @@ -203,15 +203,13 @@ def cli(ctx, *args, **kwargs): @pass_config def lattice(c, *args, **kwargs): """Use lines between text to parse the table.""" - conf = c.config - pages = conf.pop("pages") - output = conf.pop("output") - f = conf.pop("format") - compress = conf.pop("zip") - quiet = conf.pop("quiet") + pages = kwargs.pop("pages") + output = kwargs.pop("output") + f = kwargs.pop("format") + compress = kwargs.pop("zip") + quiet = kwargs.pop("quiet") plot_type = kwargs.pop("plot_type") filepath = kwargs.pop("filepath") - kwargs.update(conf) table_regions = list(kwargs["table_regions"]) kwargs["table_regions"] = None if not table_regions else table_regions @@ -243,6 +241,55 @@ def lattice(c, *args, **kwargs): @cli.command("stream") +@click.option( + "-q", "--quiet", is_flag=False, default=False, help="Suppress logs and warnings." +) +@click.option( + "-p", + "--pages", + default="1", + help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.", +) +@click.option( + "--parallel", + is_flag=True, + default=False, + help="Read pdf pages in parallel using all CPU cores.", +) +@click.option("-pw", "--password", help="Password for decryption.") +@click.option("-o", "--output", help="Output file path.") +@click.option( + "-f", + "--format", + type=click.Choice(["csv", "excel", "html", "json", "markdown", "sqlite"]), + help="Output file format.", +) +@click.option("-z", "--zip", is_flag=True, help="Create ZIP archive.") +@click.option( + "-split", + "--split_text", + is_flag=True, + help="Split text that spans across multiple cells.", +) +@click.option( + "-flag", + "--flag_size", + is_flag=True, + help="Flag text based on" " font size. Useful to detect super/subscripts.", +) +@click.option( + "-strip", + "--strip_text", + help="Characters that should be stripped from a string before" + " assigning it to a cell.", +) +@click.option( + "-M", + "--margins", + nargs=3, + default=(1.0, 0.5, 0.1), + help="PDFMiner char_margin, line_margin and word_margin.", +) @click.option( "-R", "--table_regions", @@ -295,15 +342,13 @@ def lattice(c, *args, **kwargs): @pass_config def stream(c, *args, **kwargs): """Use spaces between text to parse the table.""" - conf = c.config - pages = conf.pop("pages") - output = conf.pop("output") - f = conf.pop("format") - compress = conf.pop("zip") - quiet = conf.pop("quiet") + pages = kwargs.pop("pages") + output = kwargs.pop("output") + f = kwargs.pop("format") + compress = kwargs.pop("zip") + quiet = kwargs.pop("quiet") plot_type = kwargs.pop("plot_type") filepath = kwargs.pop("filepath") - kwargs.update(conf) table_regions = list(kwargs["table_regions"]) kwargs["table_regions"] = None if not table_regions else table_regions @@ -312,7 +357,7 @@ def stream(c, *args, **kwargs): columns = list(kwargs["columns"]) kwargs["columns"] = None if not columns else columns - margins = conf.pop("margins") + margins = kwargs.pop("margins") if margins is None: layout_kwargs = {} @@ -350,6 +395,55 @@ def stream(c, *args, **kwargs): @cli.command("hybrid") +@click.option( + "-q", "--quiet", is_flag=False, default=False, help="Suppress logs and warnings." +) +@click.option( + "-p", + "--pages", + default="1", + help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.", +) +@click.option( + "--parallel", + is_flag=True, + default=False, + help="Read pdf pages in parallel using all CPU cores.", +) +@click.option("-pw", "--password", help="Password for decryption.") +@click.option("-o", "--output", help="Output file path.") +@click.option( + "-f", + "--format", + type=click.Choice(["csv", "excel", "html", "json", "markdown", "sqlite"]), + help="Output file format.", +) +@click.option("-z", "--zip", is_flag=True, help="Create ZIP archive.") +@click.option( + "-split", + "--split_text", + is_flag=True, + help="Split text that spans across multiple cells.", +) +@click.option( + "-flag", + "--flag_size", + is_flag=True, + help="Flag text based on" " font size. Useful to detect super/subscripts.", +) +@click.option( + "-strip", + "--strip_text", + help="Characters that should be stripped from a string before" + " assigning it to a cell.", +) +@click.option( + "-M", + "--margins", + nargs=3, + default=(1.0, 0.5, 0.1), + help="PDFMiner char_margin, line_margin and word_margin.", +) @click.option( "-R", "--table_regions", @@ -402,15 +496,13 @@ def stream(c, *args, **kwargs): @pass_config def hybrid(c, *args, **kwargs): """Combines the strengths of both the Network and the Lattice parser.""" - conf = c.config - pages = conf.pop("pages") - output = conf.pop("output") - f = conf.pop("format") - compress = conf.pop("zip") - quiet = conf.pop("quiet") + pages = kwargs.pop("pages") + output = kwargs.pop("output") + f = kwargs.pop("format") + compress = kwargs.pop("zip") + quiet = kwargs.pop("quiet") plot_type = kwargs.pop("plot_type") filepath = kwargs.pop("filepath") - kwargs.update(conf) table_regions = list(kwargs["table_regions"]) kwargs["table_regions"] = None if not table_regions else table_regions @@ -441,6 +533,55 @@ def hybrid(c, *args, **kwargs): @cli.command("network") +@click.option( + "-q", "--quiet", is_flag=False, default=False, help="Suppress logs and warnings." +) +@click.option( + "-p", + "--pages", + default="1", + help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.", +) +@click.option( + "--parallel", + is_flag=True, + default=False, + help="Read pdf pages in parallel using all CPU cores.", +) +@click.option("-pw", "--password", help="Password for decryption.") +@click.option("-o", "--output", help="Output file path.") +@click.option( + "-f", + "--format", + type=click.Choice(["csv", "excel", "html", "json", "markdown", "sqlite"]), + help="Output file format.", +) +@click.option("-z", "--zip", is_flag=True, help="Create ZIP archive.") +@click.option( + "-split", + "--split_text", + is_flag=True, + help="Split text that spans across multiple cells.", +) +@click.option( + "-flag", + "--flag_size", + is_flag=True, + help="Flag text based on" " font size. Useful to detect super/subscripts.", +) +@click.option( + "-strip", + "--strip_text", + help="Characters that should be stripped from a string before" + " assigning it to a cell.", +) +@click.option( + "-M", + "--margins", + nargs=3, + default=(1.0, 0.5, 0.1), + help="PDFMiner char_margin, line_margin and word_margin.", +) @click.option( "-R", "--table_regions", @@ -493,15 +634,13 @@ def hybrid(c, *args, **kwargs): @pass_config def network(c, *args, **kwargs): """Use text alignments to parse the table.""" - conf = c.config - pages = conf.pop("pages") - output = conf.pop("output") - f = conf.pop("format") - compress = conf.pop("zip") - quiet = conf.pop("quiet") + pages = kwargs.pop("pages") + output = kwargs.pop("output") + f = kwargs.pop("format") + compress = kwargs.pop("zip") + quiet = kwargs.pop("quiet") plot_type = kwargs.pop("plot_type") filepath = kwargs.pop("filepath") - kwargs.update(conf) table_regions = list(kwargs["table_regions"]) kwargs["table_regions"] = None if not table_regions else table_regions diff --git a/tests/test_cli.py b/tests/test_cli.py index a3801152..74e39a1d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -32,16 +32,16 @@ def test_cli_lattice(testdir): outfile = os.path.join(tempdir, "foo.csv") runner = CliRunner() result = runner.invoke( - cli, ["--format", "csv", "--output", outfile, "lattice", infile] + cli, ["lattice", "--format", "csv", "--output", outfile, infile] ) assert result.exit_code == 0 assert "Found 1 tables" in result.output - result = runner.invoke(cli, ["--format", "csv", "lattice", infile]) + result = runner.invoke(cli, ["lattice", "--format", "csv", infile]) output_error = "Error: Please specify output file path using --output" assert output_error in result.output - result = runner.invoke(cli, ["--output", outfile, "lattice", infile]) + result = runner.invoke(cli, ["lattice", "--output", outfile, infile]) format_error = "Please specify output file format using --format" assert format_error in result.output @@ -52,22 +52,23 @@ def test_cli_stream(testdir): outfile = os.path.join(tempdir, "budget.csv") runner = CliRunner() result = runner.invoke( - cli, ["--format", "csv", "--output", outfile, "stream", infile] + cli, ["stream", "--format", "csv", "--output", outfile, infile] ) assert result.exit_code == 0 assert result.output == "Found 1 tables\n" - result = runner.invoke(cli, ["--format", "csv", "stream", infile]) + result = runner.invoke(cli, ["stream", "--format", "csv", infile]) output_error = "Error: Please specify output file path using --output" assert output_error in result.output - result = runner.invoke(cli, ["--output", outfile, "stream", infile]) + result = runner.invoke(cli, ["stream", "--output", outfile, infile]) format_error = "Please specify output file format using --format" assert format_error in result.output result = runner.invoke( cli, [ + "stream", "--margins", "1.5", "0.5", @@ -76,7 +77,6 @@ def test_cli_stream(testdir): "csv", "--output", outfile, - "stream", infile, ], ) @@ -86,6 +86,7 @@ def test_cli_stream(testdir): result = runner.invoke( cli, [ + "stream", "--margins", "1.5", "0.5", @@ -93,7 +94,6 @@ def test_cli_stream(testdir): "csv", "--output", outfile, - "stream", infile, ], ) @@ -110,6 +110,7 @@ def test_cli_parallel(testdir): result = runner.invoke( cli, [ + "lattice", "--parallel", "--pages", "1,2,3", @@ -117,7 +118,6 @@ def test_cli_parallel(testdir): "csv", "--output", outfile, - "lattice", infile, ], ) @@ -131,16 +131,16 @@ def test_cli_hybrid(testdir): outfile = os.path.join(tempdir, "budget.csv") runner = CliRunner() result = runner.invoke( - cli, ["--format", "csv", "--output", outfile, "hybrid", infile] + cli, ["hybrid", "--format", "csv", "--output", outfile, infile] ) assert result.exit_code == 0 assert result.output == "Found 1 tables\n" - result = runner.invoke(cli, ["--format", "csv", "hybrid", infile]) + result = runner.invoke(cli, ["hybrid", "--format", "csv", infile]) output_error = "Error: Please specify output file path using --output" assert output_error in result.output - result = runner.invoke(cli, ["--output", outfile, "hybrid", infile]) + result = runner.invoke(cli, ["hybrid", "--output", outfile, infile]) format_error = "Please specify output file format using --format" assert format_error in result.output @@ -151,14 +151,14 @@ def test_cli_network(testdir): outfile = os.path.join(tempdir, "budget.csv") runner = CliRunner() result = runner.invoke( - cli, ["--format", "csv", "--output", outfile, "network", infile] + cli, ["network", "--format", "csv", "--output", outfile, infile] ) assert result.exit_code == 0 assert result.output == "Found 1 tables\n" - result = runner.invoke(cli, ["--format", "csv", "network", infile]) + result = runner.invoke(cli, ["network", "--format", "csv", infile]) output_error = "Error: Please specify output file path using --output" assert output_error in result.output - result = runner.invoke(cli, ["--output", outfile, "network", infile]) + result = runner.invoke(cli, ["network", "--output", outfile, infile]) format_error = "Please specify output file format using --format" assert format_error in result.output @@ -171,13 +171,13 @@ def test_cli_password(testdir): result = runner.invoke( cli, [ + "stream", "--password", "userpass", "--format", "csv", "--output", outfile, - "stream", infile, ], ) @@ -187,7 +187,7 @@ def test_cli_password(testdir): output_error = "File has not been decrypted" # no password result = runner.invoke( - cli, ["--format", "csv", "--output", outfile, "stream", infile] + cli, ["stream", "--format", "csv", "--output", outfile, infile] ) assert output_error in str(result.exception) @@ -195,13 +195,13 @@ def test_cli_password(testdir): result = runner.invoke( cli, [ + "stream", "--password", "wrongpass", "--format", "csv", "--output", outfile, - "stream", infile, ], ) @@ -218,7 +218,7 @@ def test_cli_output_format(testdir): outfile = os.path.join(tempdir, "health.json") result = runner.invoke( cli, - ["--format", "json", "--output", outfile, "stream", infile], + ["stream", "--format", "json", "--output", outfile, infile], ) assert result.exit_code == 0, f"Output: {result.output}" @@ -226,7 +226,7 @@ def test_cli_output_format(testdir): outfile = os.path.join(tempdir, "health.xlsx") result = runner.invoke( cli, - ["--format", "excel", "--output", outfile, "stream", infile], + ["stream", "--format", "excel", "--output", outfile, infile], ) assert result.exit_code == 0, f"Output: {result.output}" @@ -234,7 +234,7 @@ def test_cli_output_format(testdir): outfile = os.path.join(tempdir, "health.html") result = runner.invoke( cli, - ["--format", "html", "--output", outfile, "stream", infile], + ["stream", "--format", "html", "--output", outfile, infile], ) assert result.exit_code == 0, f"Output: {result.output}" @@ -242,7 +242,7 @@ def test_cli_output_format(testdir): outfile = os.path.join(tempdir, "health.md") result = runner.invoke( cli, - ["--format", "markdown", "--output", outfile, "stream", infile], + ["stream", "--format", "markdown", "--output", outfile, infile], ) assert result.exit_code == 0, f"Output: {result.output}" @@ -251,12 +251,12 @@ def test_cli_output_format(testdir): result = runner.invoke( cli, [ + "stream", "--zip", "--format", "csv", "--output", outfile, - "stream", infile, ], ) @@ -275,12 +275,12 @@ def test_cli_quiet(testdir): result = runner.invoke( cli, [ + "stream", "--quiet", "--format", "csv", "--output", outfile, - "stream", infile, ], ) @@ -296,12 +296,12 @@ def test_cli_lattice_plot_type(): result = runner.invoke( cli, [ + "lattice", "--plot_type", "contour", "--output", outfile, "--format", - "--format", "png", ], )