88from pathlib import Path
99
1010import pymupdf
11+ import pytesseract
1112from joblib import Parallel , delayed
1213from natsort import natsorted , ns
1314from PIL import Image
@@ -24,12 +25,35 @@ def predict(base: Path, input_file: Path, args: list[str]) -> None:
2425 relative_path = input_file .relative_to (base / "todo" )
2526
2627 try :
28+ dpi = 0
2729 if not str (input_file ).lower ().endswith (".pdf" ):
2830 image = Image .open (input_file )
29- image .convert ("RGB" ).save (input_file , dpi = image .info .get ("dpi" , (300 , 300 )))
31+ if "dpi" in image .info :
32+ dpi = max (image .info ["dpi" ])
33+ else :
34+ data = pytesseract .image_to_data (
35+ image , output_type = pytesseract .Output .DICT
36+ )
37+ heights = [
38+ data ["height" ][i ]
39+ for i in range (len (data ["text" ]))
40+ if int (data ["conf" ][i ]) > 0
41+ ]
42+ if heights :
43+ median_height = sorted (heights )[len (heights ) // 2 ]
44+ dpi = int (72 * (image .height / median_height ))
45+ # if dpi does not make sense, set to 300
46+ if dpi < 50 or dpi > 1200 :
47+ dpi = 300
48+ image .convert ("RGB" ).save (input_file , dpi = image .info .get ("dpi" , (dpi , dpi )))
3049
3150 output_file = base / "done" / relative_path .with_suffix (".pdf" )
3251 output_file .parent .mkdir (exist_ok = True , parents = True )
52+
53+ # if --image-dpi is not set and dpi is known, add it
54+ if dpi > 0 and not any (arg .startswith ("--image-dpi" ) for arg in args ):
55+ args .extend (["--image-dpi" , str (dpi )])
56+
3357 subprocess .run (
3458 [
3559 "bash" ,
0 commit comments