-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathutils.py
More file actions
97 lines (78 loc) · 2.98 KB
/
utils.py
File metadata and controls
97 lines (78 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
import shutil

import pytesseract
from docx import Document
from pdf2image import convert_from_path
from PIL import Image
# from fpdf import FPDF
def create_or_empty_dir(directory):
    """
    Create the directory, or empty it if it already exists.

    When the directory exists, every entry directly inside it is removed:
    regular files and symbolic links are unlinked, and real subdirectories
    are deleted together with their contents. When it does not exist, it is
    created along with any missing parent directories.

    Args:
        directory (str): The directory path.

    Raises:
        OSError: If ``directory`` exists but cannot be listed (for example
            because it is a regular file) or an entry cannot be removed.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)
        return

    # Snapshot the entries first so nothing is deleted while the directory
    # iterator is still open.
    with os.scandir(directory) as scan:
        entries = list(scan)

    for entry in entries:
        # follow_symlinks=False: a symlink pointing at a directory is just
        # unlinked; its target is never traversed or deleted.
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            os.remove(entry.path)
def convert_pdf_to_images(input_pdf, output_dir):
    """
    Convert a PDF file to a series of JPEG images, one per page.

    Each page returned by ``convert_from_path`` is saved in ``output_dir``
    as ``page_<i>.jpg``, where ``<i>`` is the zero-based position of the page
    in that list.

    Args:
        input_pdf (str): The path to the input PDF file.
        output_dir (str): The directory to save the converted images. It is
            created if it does not exist yet. An empty string means the
            current working directory.
    """
    # Saving into a missing directory would fail on the first page, so make
    # sure the destination exists. An empty path is skipped because
    # os.makedirs("") raises, while os.path.join("", name) is still valid.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    pages = convert_from_path(input_pdf)

    # Save each page as a JPEG file using Pillow
    for index, page in enumerate(pages):
        image_path = os.path.join(output_dir, f"page_{index}.jpg")
        page.save(image_path, "JPEG")
def extract_text_from_image(image_path):
    """
    Extract text from an image using OCR (Optical Character Recognition).

    Args:
        image_path (str): The path to the input image file.

    Returns:
        str: The text returned by ``pytesseract.image_to_string`` for the
        image.
    """
    # Open the image as a context manager so the underlying file handle is
    # released as soon as OCR finishes, instead of lingering until garbage
    # collection (matters when many pages are processed in a loop).
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
# Code points below 0x20 other than tab, line feed and carriage return.
# NOTE(review): these are not legal XML 1.0 characters, and python-docx
# builds the document with lxml, which is expected to reject them; OCR
# output commonly ends with a form feed (0x0C). Confirm against the
# installed python-docx/lxml versions.
_XML_ILLEGAL_CONTROLS = dict.fromkeys(
    code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
)


def _page_sort_key(filename):
    """Order ``<prefix>_<n>.<ext>`` names by ``n``; other names sort last."""
    try:
        return (0, int(filename.split("_")[1].split(".")[0]), filename)
    except (IndexError, ValueError):
        # Not a numbered page name: keep it, but after the numbered pages,
        # ordered alphabetically.
        return (1, 0, filename)


def create_docx_with_text(image_folder, output_docx):
    """
    Create a Word document (.docx) with text extracted from images.

    Every ``.png`` / ``.jpg`` file directly inside ``image_folder`` is run
    through ``extract_text_from_image`` and its text is appended to the
    document as one paragraph. Files named like ``page_<n>.jpg`` are
    processed in ascending numeric order of ``<n>``; image files that do not
    follow that pattern are processed afterwards in alphabetical order.
    Files with other extensions are ignored.

    Args:
        image_folder (str): The directory containing the input images.
        output_docx (str): The path to save the output Word document.
    """
    document = Document()

    # Filter by extension BEFORE sorting, so unrelated files in the folder
    # (hidden files, thumbnails caches, ...) never reach the numeric sort key.
    image_names = [
        name
        for name in os.listdir(image_folder)
        if name.endswith((".png", ".jpg"))
    ]

    for filename in sorted(image_names, key=_page_sort_key):
        image_path = os.path.join(image_folder, filename)
        text = extract_text_from_image(image_path)
        # Drop anything that cannot be encoded as UTF-8 (lone surrogates)
        # while keeping all real Unicode text intact. Decoding as UTF-8
        # (not latin-1) avoids turning accented characters into mojibake.
        text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
        # Remove control characters that are not legal in XML.
        text = text.translate(_XML_ILLEGAL_CONTROLS)
        document.add_paragraph(text)

    document.save(output_docx)
# def create_pdf_with_text(image_folder, output_pdf):
# """
# Create a PDF document with text extracted from images.
# Args:
# image_folder (str): The directory containing the input images.
# output_pdf (str): The path to save the output PDF document.
# """
# pdf = FPDF()
# for filename in sorted(
# os.listdir(image_folder), key=lambda x: int(x.split("_")[1].split(".")[0])
# ):
# if filename.endswith(".png") or filename.endswith(".jpg"):
# image_path = os.path.join(image_folder, filename)
# text = extract_text_from_image(image_path)
# pdf.add_page()
# pdf.set_font("Arial", size=12)
# pdf.cell(0, 10, txt=text, ln=1)
# pdf.output(output_pdf)