|
| 1 | +# pip install pdf2docx |
| 2 | +# Import the required modules |
| 3 | +from pdf2docx import Converter |
| 4 | + |
| 5 | + |
def convert_pdf_to_docx(pdf_file_path, docx_file_path):
    """
    Converts a PDF file to a DOCX file using pdf2docx library.

    Parameters:
    - pdf_file_path (str): The path to the input PDF file.
    - docx_file_path (str): The desired path for the output DOCX file.

    Returns:
    None

    Any exception raised by the conversion propagates to the caller; the
    converter is closed first in every case.
    """
    # Build the converter for the source PDF.
    cv = Converter(pdf_file_path)
    try:
        # Write the converted document to the requested DOCX path.
        cv.convert(docx_file_path)
    finally:
        # Close the converter even when convert() raises, so it is never
        # left open after a failed conversion.
        cv.close()
| 27 | + |
| 28 | + |
| 29 | +# Example usage |
| 30 | + |
| 31 | +# Keeping the PDF's location in a separate variable |
| 32 | +# pdf_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python.pdf" |
| 33 | +# # Maintaining the Document's path in a separate variable |
| 34 | +# docx_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python_edit.docx" |
| 35 | + |
# Input PDF location (raw string, so the Windows backslashes stay literal)
pdf_file_path = r"C:\Users\playn\OneDrive\Desktop\read_kar_ke_feedback_le_aur_del_kar_de.pdf"
# Output DOCX location, in the same folder as the input PDF
docx_file_path = r"C:\Users\playn\OneDrive\Desktop\read_kar_ke_feedback_le_aur_del_kar_de.docx"

# Run the conversion with the two paths defined above
convert_pdf_to_docx(pdf_file_path=pdf_file_path, docx_file_path=docx_file_path)
| 47 | + |
| 48 | +# # TODO: error handling |
| 49 | +# # If the file is already present (presumably the output DOCX), ask for permission before continuing; otherwise just continue |
| 50 | + |
| 51 | + |
| 52 | +# import fitz |
| 53 | +# from docx import Document |
| 54 | +# import pytesseract |
| 55 | +# from PIL import Image |
| 56 | + |
| 57 | + |
| 58 | +# class PDFToDocxConverter: |
| 59 | +# """ |
| 60 | +# A class to convert PDF to DOCX with OCR using PyMuPDF, pytesseract, and python-docx. |
| 61 | +# """ |
| 62 | + |
| 63 | +# def __init__(self, pdf_path, docx_path): |
| 64 | +# """ |
| 65 | +# Initializes the PDFToDocxConverter. |
| 66 | + |
| 67 | +# Parameters: |
| 68 | +# - pdf_path (str): The path to the input PDF file. |
| 69 | +# - docx_path (str): The desired path for the output DOCX file. |
| 70 | +# """ |
| 71 | +# self.pdf_path = pdf_path |
| 72 | +# self.docx_path = docx_path |
| 73 | + |
| 74 | +# def convert_pdf_to_docx(self): |
| 75 | +# """ |
| 76 | +# Converts the PDF to DOCX with OCR and saves the result. |
| 77 | +# """ |
| 78 | +# doc = Document() |
| 79 | + |
| 80 | +# with fitz.open(self.pdf_path) as pdf: |
| 81 | +# for page_num in range(pdf.page_count): |
| 82 | +# page = pdf[page_num] |
| 83 | +# image_list = page.get_images(full=True) |
| 84 | + |
| 85 | +# for img_index, img_info in enumerate(image_list): |
| 86 | +# img = page.get_pixmap(image_index=img_index) |
| 87 | +# img_path = f"temp_image_{img_index}.png" |
| 88 | +# img.writePNG(img_path) |
| 89 | + |
| 90 | +# text = pytesseract.image_to_string(Image.open(img_path)) |
| 91 | +# doc.add_paragraph(text) |
| 92 | + |
| 93 | +# doc.save(self.docx_path) |
| 94 | + |
| 95 | + |
| 96 | +# if __name__ == "__main__": |
| 97 | +# # Example usage |
| 98 | +# # Keeping the PDF's location in a separate variable |
| 99 | +# pdf_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python.pdf" |
| 100 | +# # Maintaining the Document's path in a separate variable |
| 101 | +# docx_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python_edit.docx" |
| 102 | + |
| 103 | +# converter = PDFToDocxConverter(pdf_file_path, docx_file_path) |
| 104 | +# # converter.convert_pdf_to_docx() |
| 105 | + |
| 106 | + |
| 107 | +# # failed experiment. |
0 commit comments