Clearer variables for image cropping (#2298)

pamelafox · web-flow · commit aed96cb68f91 · 2025-01-28T16:54:05.000-08:00
diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py
@@ -224,25 +224,25 @@ def table_to_html(table: DocumentTable):
 
     @staticmethod
     def crop_image_from_pdf_page(
-        doc: pymupdf.Document, page_number: int, bounding_box: tuple[float, float, float, float]
+        doc: pymupdf.Document, page_number: int, bbox_inches: tuple[float, float, float, float]
     ) -> bytes:
         """
         Crops a region from a given page in a PDF and returns it as an image.
 
         :param pdf_path: Path to the PDF file.
         :param page_number: The page number to crop from (0-indexed).
-        :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box.
+        :param bbox_inches: A tuple of (x0, y0, x1, y1) coordinates for the bounding box, in inches.
         :return: A PIL Image of the cropped area.
         """
+        # Scale the bounding box to 72 DPI
+        bbox_dpi = 72
+        bbox_pixels = [x * bbox_dpi for x in bbox_inches]
+        rect = pymupdf.Rect(bbox_pixels)
+        # Assume that the PDF has 300 DPI,
+        # and use the matrix to convert between the 2 DPIs
+        page_dpi = 300
         page = doc.load_page(page_number)
-
-        # Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1).
-        bbx = [x * 72 for x in bounding_box]
-        rect = pymupdf.Rect(bbx)
-        # Bounding box is scaled to 72 dots per inch
-        # We assume the PDF has 300 DPI
-        # The matrix is used to convert between these 2 units
-        pix = page.get_pixmap(matrix=pymupdf.Matrix(300 / 72, 300 / 72), clip=rect)
+        pix = page.get_pixmap(matrix=pymupdf.Matrix(page_dpi / bbox_dpi, page_dpi / bbox_dpi), clip=rect)
 
         img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
         bytes_io = io.BytesIO()