@@ -224,25 +224,25 @@ def table_to_html(table: DocumentTable):
224
224
225
225
@staticmethod
226
226
def crop_image_from_pdf_page (
227
- doc : pymupdf .Document , page_number : int , bounding_box : tuple [float , float , float , float ]
227
+ doc : pymupdf .Document , page_number : int , bbox_inches : tuple [float , float , float , float ]
228
228
) -> bytes :
229
229
"""
230
230
Crops a region from a given page in a PDF and returns it as an image.
231
231
232
232
:param doc: The PDF document (a pymupdf.Document) to crop from.
233
233
:param page_number: The page number to crop from (0-indexed).
234
- :param bounding_box : A tuple of (x0, y0, x1, y1) coordinates for the bounding box.
234
+ :param bbox_inches : A tuple of (x0, y0, x1, y1) coordinates for the bounding box, in inches .
235
235
:return: The image of the cropped area, as bytes.
236
236
"""
237
+ # Convert the bounding box from inches to PDF points (72 points per inch)
238
+ bbox_dpi = 72
239
+ bbox_pixels = [x * bbox_dpi for x in bbox_inches ]
240
+ rect = pymupdf .Rect (bbox_pixels )
241
+ # Render the clipped region at 300 DPI (the assumed resolution of the PDF content);
242
+ # the matrix scales from 72 points per inch up to that resolution
243
+ page_dpi = 300
237
244
page = doc .load_page (page_number )
238
-
239
- # Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1).
240
- bbx = [x * 72 for x in bounding_box ]
241
- rect = pymupdf .Rect (bbx )
242
- # Bounding box is scaled to 72 dots per inch
243
- # We assume the PDF has 300 DPI
244
- # The matrix is used to convert between these 2 units
245
- pix = page .get_pixmap (matrix = pymupdf .Matrix (300 / 72 , 300 / 72 ), clip = rect )
245
+ pix = page .get_pixmap (matrix = pymupdf .Matrix (page_dpi / bbox_dpi , page_dpi / bbox_dpi ), clip = rect )
246
246
247
247
img = Image .frombytes ("RGB" , (pix .width , pix .height ), pix .samples )
248
248
bytes_io = io .BytesIO ()
0 commit comments