Skip to content

Commit 36952a9

Browse files
committed
estimate dpi
1 parent e5814bd commit 36952a9

File tree

3 files changed

+27
-2
lines changed

3 files changed

+27
-2
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
---
1010

11-
[![build](https://github.com/ipitio/ocr-pdf/actions/workflows/publish.yml/badge.svg)](https://github.com/ipitio/ocr-pdf/actions/workflows/publish.yml) [![downloads](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fipitio.github.io%2Fbackage%2Fipitio%2Focr-pdf%2Focr-pdf.json&query=%24.downloads&logo=github&logoColor=959da5&labelColor=333a41&label=pulls)](https://github.com/ipitio/ocr-pdf/pkgs/container/ocr-pdf) [![size](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fipitio.github.io%2Fbackage%2Fipitio%2Focr-pdf%2Focr-pdf.json&query=%24.size&logo=github&logoColor=959da5&label=size&labelColor=333a41&color=indigo)](https://github.com/ipitio/backage/pkgs/container/backage) [![latest](https://img.shields.io/badge/dynamic/xml?url=https%3A%2F%2Fipitio.github.io%2Fbackage%2Fipitio%2Focr-pdf%2Focr-pdf.xml&query=%2Fbkg%2Fversion%5B.%2Flatest%5B.%3D%22true%22%5D%5D%2Ftags%5B.!%3D%22latest%22%5D&logo=github&logoColor=959da5&label=latest&labelColor=333a41&color=darkgreen)](https://github.com/ipitio/backage/pkgs/container/backage)
11+
[![downloads](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fipitio.github.io%2Fbackage%2Fipitio%2Focr-pdf%2Focr-pdf.json&query=%24.downloads&logo=github&logoColor=959da5&labelColor=333a41&label=pulls)](https://github.com/ipitio/ocr-pdf/pkgs/container/ocr-pdf) [![size](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fipitio.github.io%2Fbackage%2Fipitio%2Focr-pdf%2Focr-pdf.json&query=%24.size&logo=github&logoColor=959da5&label=size&labelColor=333a41&color=indigo)](https://github.com/ipitio/backage/pkgs/container/backage) [![latest](https://img.shields.io/badge/dynamic/xml?url=https%3A%2F%2Fipitio.github.io%2Fbackage%2Fipitio%2Focr-pdf%2Focr-pdf.xml&query=%2Fxml%2Fversion%5B.%2Flatest%5B.%3D%22true%22%5D%5D%2Ftags%5B.!%3D%22latest%22%5D&logo=github&logoColor=959da5&label=latest&labelColor=333a41&color=darkgreen)](https://github.com/ipitio/backage/pkgs/container/backage)
1212

1313
</div>
1414

src/main.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from pathlib import Path
99

1010
import pymupdf
11+
import pytesseract
1112
from joblib import Parallel, delayed
1213
from natsort import natsorted, ns
1314
from PIL import Image
@@ -24,12 +25,35 @@ def predict(base: Path, input_file: Path, args: list[str]) -> None:
2425
relative_path = input_file.relative_to(base / "todo")
2526

2627
try:
28+
dpi = 0
2729
if not str(input_file).lower().endswith(".pdf"):
2830
image = Image.open(input_file)
29-
image.convert("RGB").save(input_file, dpi=image.info.get("dpi", (300, 300)))
31+
if "dpi" in image.info:
32+
dpi = max(image.info["dpi"])
33+
else:
34+
data = pytesseract.image_to_data(
35+
image, output_type=pytesseract.Output.DICT
36+
)
37+
heights = [
38+
data["height"][i]
39+
for i in range(len(data["text"]))
40+
if int(data["conf"][i]) > 0
41+
]
42+
if heights:
43+
median_height = sorted(heights)[len(heights) // 2]
44+
dpi = int(72 * (image.height / median_height))
45+
# if dpi does not make sense, set to 300
46+
if dpi < 50 or dpi > 1200:
47+
dpi = 300
48+
image.convert("RGB").save(input_file, dpi=image.info.get("dpi", (dpi, dpi)))
3049

3150
output_file = base / "done" / relative_path.with_suffix(".pdf")
3251
output_file.parent.mkdir(exist_ok=True, parents=True)
52+
53+
# if --image-dpi is not set and dpi is known, add it
54+
if dpi > 0 and not any(arg.startswith("--image-dpi") for arg in args):
55+
args.extend(["--image-dpi", str(dpi)])
56+
3357
subprocess.run(
3458
[
3559
"bash",

src/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ pillow==10.4.0
33
joblib==1.4.2
44
psutil
55
natsort
6+
pytesseract

0 commit comments

Comments
 (0)