Skip to content

Commit aed96cb

Browse files
authored
Clearer variables for image cropping (#2298)
1 parent 46b49b0 commit aed96cb

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

app/backend/prepdocslib/pdfparser.py

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -224,25 +224,25 @@ def table_to_html(table: DocumentTable):
224224

225225
@staticmethod
226226
def crop_image_from_pdf_page(
227-
doc: pymupdf.Document, page_number: int, bounding_box: tuple[float, float, float, float]
227+
doc: pymupdf.Document, page_number: int, bbox_inches: tuple[float, float, float, float]
228228
) -> bytes:
229229
"""
230230
Crops a region from a given page in a PDF and returns it as an image.
231231
232232
:param pdf_path: Path to the PDF file.
233233
:param page_number: The page number to crop from (0-indexed).
234-
:param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box.
234+
:param bbox_inches: A tuple of (x0, y0, x1, y1) coordinates for the bounding box, in inches.
235235
:return: A PIL Image of the cropped area.
236236
"""
237+
# Scale the bounding box to 72 DPI
238+
bbox_dpi = 72
239+
bbox_pixels = [x * bbox_dpi for x in bbox_inches]
240+
rect = pymupdf.Rect(bbox_pixels)
241+
# Assume that the PDF has 300 DPI,
242+
# and use the matrix to convert between the 2 DPIs
243+
page_dpi = 300
237244
page = doc.load_page(page_number)
238-
239-
# Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1).
240-
bbx = [x * 72 for x in bounding_box]
241-
rect = pymupdf.Rect(bbx)
242-
# Bounding box is scaled to 72 dots per inch
243-
# We assume the PDF has 300 DPI
244-
# The matrix is used to convert between these 2 units
245-
pix = page.get_pixmap(matrix=pymupdf.Matrix(300 / 72, 300 / 72), clip=rect)
245+
pix = page.get_pixmap(matrix=pymupdf.Matrix(page_dpi / bbox_dpi, page_dpi / bbox_dpi), clip=rect)
246246

247247
img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
248248
bytes_io = io.BytesIO()

0 commit comments

Comments
 (0)