Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
e350b05
feat: replace pdfminer with paves.miner
Aug 1, 2025
f418442
deps: update deps
Aug 1, 2025
c2cb2f9
feat!: remove dependency on pypdf by not extracting pages
Aug 1, 2025
773a06e
fix: nope gotta use those laparams
Aug 1, 2025
cb85d80
chore: foo
Aug 1, 2025
987c92a
fix: support parallel like before
Aug 1, 2025
41dc89d
fix: make error messages match
Aug 1, 2025
43e0a0e
fix: raise from
Aug 1, 2025
66971d5
fix!: allow converting other than page 1
Aug 1, 2025
c8b612a
fix: update error message
Aug 1, 2025
61e3130
fix: rely on new playa that lets you set rotation
Aug 1, 2025
afd8acb
fix: apply rotation in image processing and plotting
Aug 2, 2025
b32ce33
chore: lock
Aug 2, 2025
0608dce
fix: apply rotation to threshold too
Aug 2, 2025
b7fc7a3
fix(tests): ensure ultimate error message compatibility
Aug 2, 2025
92ff79f
fix(types): fix types
Aug 2, 2025
a994198
chore: isort
Aug 2, 2025
a58fa0f
chore: blacken
Aug 2, 2025
b5e04b3
revert: go back to master for notebook
Aug 2, 2025
89e4fcf
fix: is_extractable works in strange ways...
Aug 2, 2025
3ce32a9
Revert "fix: is_extractable works in strange ways..."
Aug 2, 2025
f824257
fix(tests): verify that no-extraction is respected
Aug 2, 2025
065e83e
fix: remove unused import
dhdaines Aug 16, 2025
c6371e0
fix(types): remove test that cannot possibly work and has bad types
dhdaines Aug 16, 2025
dc35e24
fix: render the correct page and do not save images in lattice parser
dhdaines Aug 17, 2025
64e5b50
fix(deps): restore python 3.8 compatibility with latest playa
Aug 17, 2025
4a54f0d
fix(tests): add setuptools dependency to hopefully fix py3.8 tests
Aug 20, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion camelot/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ def installed(self) -> bool: # noqa D102
raise NotImplementedError

def convert( # noqa D102
self, pdf_path: str, png_path: str, resolution: int = 300
self, pdf_path: str, png_path: str, resolution: int = 300, page: int = 1
) -> None: # noqa D102
raise NotImplementedError
8 changes: 7 additions & 1 deletion camelot/backends/ghostscript_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
class GhostscriptBackend(ConversionBackend):
"""Conversion backend that renders a PDF page to PNG using Ghostscript."""

def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
def convert(
self, pdf_path: str, png_path: str, resolution: int = 300, page: int = 1
) -> None:
"""Convert a PDF to a PNG image using Ghostscript.

Parameters
Expand All @@ -17,6 +19,8 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
[description]
resolution : int, optional
[description], by default 300
page : int, optional
1-based number of the single page to convert, by default 1.

Raises
------
Expand All @@ -35,6 +39,8 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
"gs",
"-q",
"-sDEVICE=png16m",
f"-dFirstPage={page}",
f"-dLastPage={page}",
"-o",
png_path,
f"-r{resolution}",
Expand Down
8 changes: 5 additions & 3 deletions camelot/backends/image_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def implements_convert():

return backend

def convert(self, pdf_path: str, png_path: str) -> None:
def convert(self, pdf_path: str, png_path: str, page: int = 1) -> None:
"""Convert PDF to png_path.

Parameters
Expand All @@ -113,6 +113,8 @@ def convert(self, pdf_path: str, png_path: str) -> None:
Path where to read the pdf file.
png_path : str
Path where to save png file.
page : int, optional
1-based number of the single page to convert, by default 1.

Raises
------
Expand All @@ -122,13 +124,13 @@ def convert(self, pdf_path: str, png_path: str) -> None:
[description]
"""
try:
self.backend.convert(pdf_path, png_path)
self.backend.convert(pdf_path, png_path, page=page)
except Exception as f:
if self.use_fallback:
for fallback in self.fallbacks:
try:
converter = BACKENDS[fallback]()
converter.convert(pdf_path, png_path)
converter.convert(pdf_path, png_path, page=page)
except Exception as e:
msg = f"Image conversion failed with image conversion backend {fallback!r}\n error: {e}"
raise ImageConversionError(msg) from e
Expand Down
8 changes: 6 additions & 2 deletions camelot/backends/pdfium_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ def installed(self) -> bool: # noqa D102
return True
return False

def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
def convert(
self, pdf_path: str, png_path: str, resolution: int = 300, page: int = 1
) -> None:
"""Convert PDF to png.

Parameters
Expand All @@ -29,6 +31,8 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
Path where to read the pdf file.
png_path : str
Path where to save png file.
page : int, optional
1-based number of the single page to convert, by default 1.

Raises
------
Expand All @@ -39,5 +43,5 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
raise OSError(f"pypdfium2 is not available: {PDFIUM_EXC!r}")
doc = pdfium.PdfDocument(pdf_path)
doc.init_forms()
image = doc[0].render(scale=resolution / 72).to_pil()
image = doc[page - 1].render(scale=resolution / 72).to_pil()
image.save(png_path)
23 changes: 19 additions & 4 deletions camelot/backends/poppler_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
class PopplerBackend(ConversionBackend):
"""Conversion backend that renders a PDF page to PNG using Poppler."""

def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
def convert(
self, pdf_path: str, png_path: str, resolution: int = 300, page: int = 1
) -> None:
"""Convert PDF to png.

Parameters
Expand All @@ -31,6 +33,8 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
Path where to read the pdf file.
png_path : str
Path where to save png file.
page : int, optional
1-based number of the single page to convert, by default 1.

Raises
------
Expand All @@ -39,13 +43,24 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
ValueError
[description]
"""
pdftopng_executable = shutil.which("pdftopng", path=path)
pdftopng_executable = shutil.which("pdftocairo", path=path)
if pdftopng_executable is None:
raise OSError(
"pdftopng is not installed. You can install it using the 'pip install pdftopng' command."
"pdftocairo is not installed. Please install `poppler-utils`."
)

pdftopng_command = [pdftopng_executable, pdf_path, png_path]
png_stem, _ = os.path.splitext(png_path)
pdftopng_command = [
pdftopng_executable,
"-png",
"-singlefile",
"-f",
str(page),
"-l",
str(page),
pdf_path,
png_stem,
]

try:
subprocess.check_output(
Expand Down
6 changes: 4 additions & 2 deletions camelot/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from typing_extensions import TypedDict, Unpack

from .backends import ImageConversionBackend
from .image_processing import undo_rotation
from .utils import build_file_path_in_temp_dir
from .utils import get_index_closest_point
from .utils import get_textline_coords
Expand Down Expand Up @@ -548,6 +549,7 @@ def __init__(self, cols, rows):
self.filename = None
self.order = None
self.page = None
self.rotation = ""
self.flavor = None # Flavor of the parser that generated the table
self.pdf_size = None # Dimensions of the original PDF page
self._bbox = None # Bounding box in original document
Expand Down Expand Up @@ -618,8 +620,8 @@ def get_pdf_image(self):
os.path.basename(self.filename), ".png"
)
backend = ImageConversionBackend(use_fallback=True)
backend.convert(self.filename, self._image_path)
self._image = cv2.imread(self._image_path)
backend.convert(self.filename, self._image_path, page=self.page)
self._image = undo_rotation(cv2.imread(self._image_path), self.rotation)
return self._image

def set_all_edges(self):
Expand Down
Loading