Skip to content

Commit 4559c14

Browse files
authored
Merge pull request #115 from aspose-pdf-cloud/refactored-parser
Refactored Parser use cases
2 parents dc21faf + bca5a04 commit 4559c14

File tree

7 files changed

+220
-0
lines changed

7 files changed

+220
-0
lines changed

Uses-Cases/Parser/get_fdf.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from paresr_helpers import ParesrHelper
2+
from pathlib import Path
3+
import logging
4+
5+
class ExportFormToFDF:
    """Export the form fields of a PDF document to an FDF file using Aspose PDF Cloud API."""

    def __init__(self, helper: ParesrHelper):
        """Keep the helper that provides `pdf_api`, `upload_document` and `downloadFile`."""
        self.helper = helper

    def Extract(self, documentName: str, outputFDFName: str, localFolder: Path, remoteFolder: str):
        """Upload the PDF, export its form fields to FDF in storage and download the result.

        Args:
            documentName: Name of the PDF document passed to the helper for upload.
            outputFDFName: Name of the FDF file created in the remote folder and
                used for the downloaded copy.
            localFolder: Local directory passed to the helper's download routine.
            remoteFolder: Remote folder used for the upload, the export and the download.
        """
        self.helper.upload_document(documentName, remoteFolder)

        # as_posix() keeps the remote path '/'-separated on every platform;
        # str(Path) would produce backslashes on Windows, unlike the
        # '/'-joined remote paths the helper builds for upload and download.
        fdfPath = Path(remoteFolder, outputFDFName).as_posix()
        opts = {
            "folder": remoteFolder
        }
        response = self.helper.pdf_api.put_export_fields_from_pdf_to_fdf_in_storage(documentName, fdfPath, **opts)
        if response.code != 200:
            logging.error("ExportFormToFDF(): Unexpected error!")
            return

        logging.info(f"ExportFormToFDF(): Pdf document '{documentName}' form fields successfully exported to '{outputFDFName}' file.")
        self.helper.downloadFile(outputFDFName, outputFDFName, localFolder, remoteFolder, "")

Uses-Cases/Parser/get_images.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from paresr_helpers import ParesrHelper
2+
from pathlib import Path
3+
import shutil
4+
import logging
5+
6+
class GetImages:
    """Extract the images of a PDF document page as PNG files using Aspose PDF Cloud API."""

    def __init__(self, helper: ParesrHelper):
        """Keep the helper that provides `pdf_api` and `upload_document`."""
        self.helper = helper

    def Extract(self, documentName: str, pageNumber: int, localFolder: Path, remoteFolder: str):
        """Upload the PDF, list the images of one page and store each one as `<image id>.png`.

        Args:
            documentName: Name of the PDF document passed to the helper for upload.
            pageNumber: Page number passed to the `get_images` API call.
            localFolder: Local directory the PNG files are moved into.
            remoteFolder: Remote folder used for the upload and passed as the
                `folder` option of the API calls (a string, like the value the
                launcher passes in).
        """
        self.helper.upload_document(documentName, remoteFolder)

        opts = {
            "folder": remoteFolder
        }
        respImages = self.helper.pdf_api.get_images(documentName, pageNumber, **opts)
        if respImages.code != 200:
            logging.error("GetImages(): Unexpected error!")
            return

        for img in respImages.images.list:
            # The call's result is used as the source path of the move below.
            response = self.helper.pdf_api.get_image_extract_as_png(documentName, img.id, **opts)
            local_path = localFolder / (img.id + '.png')
            shutil.move(response, str(local_path))
            # Report success only after the file is really in place.
            logging.info(f"GetImages(): Images '{img.id}' successfully extracted from the document '{documentName}'.")

Uses-Cases/Parser/get_tables.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from paresr_helpers import ParesrHelper
2+
from pathlib import Path
3+
import json
4+
import logging
5+
6+
class GetTables:
    """Extract the tables of a PDF document into a local file using Aspose PDF Cloud API."""

    def __init__(self, helper: ParesrHelper):
        """Keep the helper that provides `pdf_api` and `upload_document`."""
        self.helper = helper

    def Extract(self, documentName: str, localFolder: Path, remoteFolder: str):
        """Upload the PDF, list its tables and write them to `tables_objects.json`.

        Each table entry is written with `json.dump` and followed by a line of
        asterisks, so the output file is a sequence of JSON fragments rather
        than one JSON document.

        Args:
            documentName: Name of the PDF document passed to the helper for upload.
            localFolder: Local directory in which `tables_objects.json` is created.
            remoteFolder: Remote folder used for the upload and passed as the
                `folder` option of the API calls.
        """
        self.helper.upload_document(documentName, remoteFolder)

        opts = {
            "folder": remoteFolder
        }
        respTables = self.helper.pdf_api.get_document_tables(documentName, **opts)
        if respTables.code != 200:
            logging.error("GetTables(): Unexpected error!")
            return

        localJson = Path(localFolder) / "tables_objects.json"
        with open(str(localJson), "w", encoding="utf-8") as localFile:
            for tab in respTables.tables.list:
                response = self.helper.pdf_api.get_table(documentName, tab.id, **opts)
                if response.code != 200:
                    logging.error("GetTables(): Unexpected error!")
                    continue

                logging.info(f"GetTables(): Table '{tab.id}' successfully extracted from the document '{documentName}'.")
                # NOTE(review): the list entry `tab` is written, not the
                # per-table `response` fetched above — confirm this is intended.
                json.dump(tab, localFile, ensure_ascii=False, default=str)
                localFile.write("\n*********************\n")

Uses-Cases/Parser/get_textboxes.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from paresr_helpers import ParesrHelper
2+
from pathlib import Path
3+
import json
4+
import logging
5+
6+
class GetTextBoxes:
    """Extract the text box fields of a PDF document into a local file using Aspose PDF Cloud API."""

    def __init__(self, helper: ParesrHelper):
        """Keep the helper that provides `pdf_api` and `upload_document`."""
        self.helper = helper

    def Extract(self, documentName: str, localFolder: Path, remoteFolder: str):
        """Upload the PDF, list its text box fields and write them to `text_box_objects.json`.

        Each field entry is written with `json.dump` and followed by a line of
        asterisks, so the output file is a sequence of JSON fragments rather
        than one JSON document.

        Args:
            documentName: Name of the PDF document passed to the helper for upload.
            localFolder: Local directory in which `text_box_objects.json` is created.
            remoteFolder: Remote folder used for the upload and passed as the
                `folder` option of the API calls.
        """
        self.helper.upload_document(documentName, remoteFolder)

        opts = {
            "folder": remoteFolder
        }
        respTextBoxes = self.helper.pdf_api.get_document_text_box_fields(documentName, **opts)
        if respTextBoxes.code != 200:
            logging.error("GetTextBoxes(): Unexpected error!")
            return

        localJson = Path(localFolder) / "text_box_objects.json"
        with open(str(localJson), "w", encoding="utf-8") as localFile:
            for textBox in respTextBoxes.fields.list:
                response = self.helper.pdf_api.get_text_box_field(documentName, textBox.full_name, **opts)
                if response.code != 200:
                    logging.error("GetTextBoxes(): Unexpected error!")
                    continue

                logging.info(f"GetTextBoxes(): TextBox field '{textBox.full_name}' successfully extracted from the document '{documentName}'.")
                # NOTE(review): the list entry `textBox` is written, not the
                # per-field `response` fetched above — confirm this is intended.
                json.dump(textBox, localFile, ensure_ascii=False, default=str)
                localFile.write("\n*********************\n")

Uses-Cases/Parser/get_xml.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from paresr_helpers import ParesrHelper, Config
2+
from pathlib import Path
3+
import logging
4+
5+
class ExportFormToXXML:
    """Export the form fields of a PDF document to an XML file using Aspose PDF Cloud API."""

    def __init__(self, helper: ParesrHelper):
        """Keep the helper that provides `pdf_api`, `upload_document` and `downloadFile`."""
        self.helper = helper

    def Extract(self, documentName: str, outputXMLName: str, localFolder: Path, remoteFolder: str):
        """Upload the PDF, export its form fields to XML in storage and download the result.

        Args:
            documentName: Name of the PDF document passed to the helper for upload.
            outputXMLName: Name of the XML file created in the remote folder and
                used for the downloaded copy.
            localFolder: Local directory passed to the helper's download routine.
            remoteFolder: Remote folder used for the upload, the export and the download.
        """
        self.helper.upload_document(documentName, remoteFolder)

        # as_posix() keeps the remote path '/'-separated on every platform;
        # str(Path) would produce backslashes on Windows, unlike the
        # '/'-joined remote paths the helper builds for upload and download.
        xmlPath = Path(remoteFolder, outputXMLName).as_posix()
        opts = {
            "folder": remoteFolder
        }
        response = self.helper.pdf_api.put_export_fields_from_pdf_to_xml_in_storage(documentName, xmlPath, **opts)
        if response.code != 200:
            logging.error("ExportFormToXML(): Unexpected error!")
            return

        logging.info(f"ExportFormToXML(): Pdf document '{documentName}' form fields successfully exported to '{outputXMLName}' file.")
        self.helper.downloadFile(outputXMLName, outputXMLName, localFolder, remoteFolder, "")
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import shutil
2+
import json
3+
import logging
4+
from pathlib import Path
5+
from asposepdfcloud import ApiClient, PdfApi
6+
7+
# Root logger setup: INFO level, "<time> - <level> - <message>" line format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
9+
10+
11+
class Config:
    """Constant settings shared by the Parser samples."""

    # --- Local side -------------------------------------------------------
    # NOTE(review): this is a raw string, so "\\" is two literal backslashes,
    # and the "..s" segment looks like a typo — confirm the intended path.
    CREDENTIALS_FILE: Path = Path(r"..s\\credentials.json")
    LOCAL_FOLDER: Path = Path(r"C:\Samples")

    # --- Remote storage ---------------------------------------------------
    REMOTE_TEMP_FOLDER: str = "TempPdfCloud"

    # --- Document names and the page to process ---------------------------
    PDF_DOCUMENT_NAME: str = "sample.pdf"
    XML_OUTPUT_FILE: str = "output_sample.xml"
    FDF_OUTPUT_FILE: str = "output_sample.fdf"
    LOCAL_RESULT_DOCUMENT_NAME: str = "output_sample.pdf"
    PAGE_NUMBER: int = 1
21+
22+
23+
class ParesrHelper:
    """Helper that owns the Aspose PDF Cloud client and wraps file upload/download.

    `pdf_api` stays `None` when the credentials cannot be loaded; the upload
    and download methods then do nothing.
    """

    def __init__(self, credentials_file: Path = Config.CREDENTIALS_FILE):
        """Create the helper and try to initialize `pdf_api` from `credentials_file`."""
        self.pdf_api = None
        self._init_api(credentials_file)

    def _init_api(self, credentials_file: Path):
        """Initialize the API client from a JSON credentials file.

        The file must contain non-empty "key" and "id" entries. A missing
        file, invalid JSON or missing entries are logged, not raised, and
        `pdf_api` is left as `None`.
        """
        try:
            with credentials_file.open("r", encoding="utf-8") as file:
                credentials = json.load(file)
            api_key, app_id = credentials.get("key"), credentials.get("id")
            if not api_key or not app_id:
                raise ValueError("Error: Missing API keys in the credentials file.")
            self.pdf_api = PdfApi(ApiClient(api_key, app_id))
        except (FileNotFoundError, json.JSONDecodeError, ValueError) as e:
            logging.error(f"Failed to load credentials: {e}")

    def upload_document(self, documentName: str, remoteFolder: str):
        """Upload `Config.LOCAL_FOLDER / documentName` to the Aspose Cloud storage.

        Args:
            documentName: File name, looked up in `Config.LOCAL_FOLDER`.
            remoteFolder: Remote folder to upload into; `None` uploads the
                file under its bare name.

        Upload failures are logged, not raised.
        """
        if self.pdf_api is None:
            return

        file_path = Config.LOCAL_FOLDER / documentName
        if remoteFolder is None:
            remote_path = documentName
        else:
            remote_path = remoteFolder + '/' + documentName
        try:
            # Always hand the local path over as a string, in both branches.
            self.pdf_api.upload_file(remote_path, str(file_path))
            logging.info(f"File {documentName} uploaded successfully.")
        except Exception as e:
            logging.error(f"Failed to upload file: {e}")

    def downloadFile(self, document: str, outputDocument: str, localFolder: Path, remoteFolder: str, output_prefix: str):
        """Download a file from the Aspose Cloud storage into `localFolder`.

        Args:
            document: Name of the remote file.
            outputDocument: Name of the local file (before the prefix is applied).
            localFolder: Local directory that receives the file.
            remoteFolder: Remote folder holding the file; `None` addresses the
                file by its bare name, mirroring `upload_document`.
            output_prefix: Text prepended to `outputDocument` to form the
                local file name.

        Download failures are logged, not raised.
        """
        if self.pdf_api is None:
            return

        if remoteFolder is None:
            remote_path = document
        else:
            remote_path = remoteFolder + '/' + document
        try:
            # The call's result is used as the source path of the move below.
            temp_file = self.pdf_api.download_file(remote_path)
            local_path = localFolder / (output_prefix + outputDocument)
            shutil.move(temp_file, str(local_path))
            logging.info(f"downloadFile(): File successfully downloaded: {local_path}")
        except Exception as e:
            logging.error(f"downloadFile(): Failed to download file: {e}")
66+

Uses-Cases/Parser/parser_launch.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from paresr_helpers import ParesrHelper, Config
2+
from get_xml import ExportFormToXXML
3+
from get_fdf import ExportFormToFDF
4+
from get_images import GetImages
5+
from get_tables import GetTables
6+
from get_textboxes import GetTextBoxes
7+
8+
if __name__ == "__main__":
    helper = ParesrHelper(Config.CREDENTIALS_FILE)

    # ParesrHelper only logs a credentials failure and leaves pdf_api as None;
    # stop here with a clear message instead of failing in the first extractor
    # with an AttributeError on None.
    if helper.pdf_api is None:
        raise SystemExit("Aspose PDF Cloud client is not initialized; check the credentials file.")

    # Export form fields to XML.
    xmlExtractor = ExportFormToXXML(helper)
    xmlExtractor.Extract(Config.PDF_DOCUMENT_NAME, Config.XML_OUTPUT_FILE, Config.LOCAL_FOLDER, Config.REMOTE_TEMP_FOLDER)

    # Export form fields to FDF.
    fdfExtractor = ExportFormToFDF(helper)
    fdfExtractor.Extract(Config.PDF_DOCUMENT_NAME, Config.FDF_OUTPUT_FILE, Config.LOCAL_FOLDER, Config.REMOTE_TEMP_FOLDER)

    # Extract the images of the configured page.
    getImages = GetImages(helper)
    getImages.Extract(Config.PDF_DOCUMENT_NAME, Config.PAGE_NUMBER, Config.LOCAL_FOLDER, Config.REMOTE_TEMP_FOLDER)

    # Extract tables.
    getTables = GetTables(helper)
    getTables.Extract(Config.PDF_DOCUMENT_NAME, Config.LOCAL_FOLDER, Config.REMOTE_TEMP_FOLDER)

    # Extract text box fields.
    getTextBoxes = GetTextBoxes(helper)
    getTextBoxes.Extract(Config.PDF_DOCUMENT_NAME, Config.LOCAL_FOLDER, Config.REMOTE_TEMP_FOLDER)

0 commit comments

Comments
 (0)