Skip to content

Commit a519e72

Browse files
add: pdf_to_docx.py program
Failed program: accuracy is very low. Should probably use pytesseract instead.
1 parent 7f7256e commit a519e72

File tree

1 file changed

+107
-0
lines changed

1 file changed

+107
-0
lines changed
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# pip install pdf2docx
2+
# Import the required modules
3+
from pdf2docx import Converter
4+
5+
6+
def convert_pdf_to_docx(pdf_file_path: str, docx_file_path: str) -> None:
    """
    Convert a PDF file to a DOCX file using the pdf2docx library.

    Parameters:
    - pdf_file_path (str): The path to the input PDF file.
    - docx_file_path (str): The desired path for the output DOCX file.

    Returns:
    None
    """
    # Create the pdf2docx converter for the input PDF.
    cv = Converter(pdf_file_path)
    try:
        # Run the conversion and write the result to the requested DOCX path.
        cv.convert(docx_file_path)
    finally:
        # Always close the converter, even if the conversion raised, so it
        # is not left open after a failure.
        cv.close()
27+
28+
29+
# Example usage
30+
31+
# Keeping the PDF's location in a separate variable
32+
# pdf_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python.pdf"
33+
# # Maintaining the Document's path in a separate variable
34+
# docx_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python_edit.docx"
35+
36+
# Input: the PDF that should be converted
pdf_file_path = r"C:\Users\playn\OneDrive\Desktop\read_kar_ke_feedback_le_aur_del_kar_de.pdf"

# Output: where the generated DOCX is written
docx_file_path = r"C:\Users\playn\OneDrive\Desktop\read_kar_ke_feedback_le_aur_del_kar_de.docx"

# Run the conversion with the two paths defined above
convert_pdf_to_docx(
    pdf_file_path=pdf_file_path,
    docx_file_path=docx_file_path,
)
47+
48+
# # Error handling
49+
# # If present, then ask for permission; else continue
50+
51+
52+
# import fitz
53+
# from docx import Document
54+
# import pytesseract
55+
# from PIL import Image
56+
57+
58+
# class PDFToDocxConverter:
59+
# """
60+
# A class to convert PDF to DOCX with OCR using PyMuPDF, pytesseract, and python-docx.
61+
# """
62+
63+
# def __init__(self, pdf_path, docx_path):
64+
# """
65+
# Initializes the PDFToDocxConverter.
66+
67+
# Parameters:
68+
# - pdf_path (str): The path to the input PDF file.
69+
# - docx_path (str): The desired path for the output DOCX file.
70+
# """
71+
# self.pdf_path = pdf_path
72+
# self.docx_path = docx_path
73+
74+
# def convert_pdf_to_docx(self):
75+
# """
76+
# Converts the PDF to DOCX with OCR and saves the result.
77+
# """
78+
# doc = Document()
79+
80+
# with fitz.open(self.pdf_path) as pdf:
81+
# for page_num in range(pdf.page_count):
82+
# page = pdf[page_num]
83+
# image_list = page.get_images(full=True)
84+
85+
# for img_index, img_info in enumerate(image_list):
86+
# img = page.get_pixmap(image_index=img_index)
87+
# img_path = f"temp_image_{img_index}.png"
88+
# img.writePNG(img_path)
89+
90+
# text = pytesseract.image_to_string(Image.open(img_path))
91+
# doc.add_paragraph(text)
92+
93+
# doc.save(self.docx_path)
94+
95+
96+
# if __name__ == "__main__":
97+
# # Example usage
98+
# # Keeping the PDF's location in a separate variable
99+
# pdf_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python.pdf"
100+
# # Maintaining the Document's path in a separate variable
101+
# docx_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python_edit.docx"
102+
103+
# converter = PDFToDocxConverter(pdf_file_path, docx_file_path)
104+
# # converter.convert_pdf_to_docx()
105+
106+
107+
# # failed experiment.

0 commit comments

Comments
 (0)