Skip to content

Commit dd122d0

Browse files
feat!: adding load/unload from key (#9)
* adding load/unload from key Signed-off-by: Peter Staar <[email protected]> * updated tests Signed-off-by: Peter Staar <[email protected]> * all fixed, still need to clean all commented out code Signed-off-by: Peter Staar <[email protected]> * ran pre-commit hooks Signed-off-by: Peter Staar <[email protected]> * allow more tabulate versions Signed-off-by: Michele Dolfi <[email protected]> * renamed some key functions Signed-off-by: Peter Staar <[email protected]> * ran pre-commit hooks (2) Signed-off-by: Peter Staar <[email protected]> --------- Signed-off-by: Peter Staar <[email protected]> Signed-off-by: Michele Dolfi <[email protected]> Co-authored-by: Michele Dolfi <[email protected]>
1 parent 89211eb commit dd122d0

18 files changed

+32295
-209
lines changed

README.md

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,85 @@ pip install docling-parse
2121

2222
Convert a PDF
2323

24-
```sh
24+
```python
2525
from docling_parse.docling_parse import pdf_parser
2626

27+
# Do this only once to load fonts (avoid initialising it many times)
2728
parser = pdf_parser()
28-
doc = parser.find_cells("mydoc.pdf")
2929

30-
for i, page in enumerate(doc["pages"]):
31-
for j, cell in enumerate(page["cells"]):
32-
print(i, "\t", j, "\t", cell["content"]["rnormalized"])
30+
# parser.set_loglevel(1) # 1=error, 2=warning, 3=success, 4=info
31+
32+
doc_file = "my-doc.pdf" # filename
33+
doc_key = f"key={doc_file}" # unique document key (e.g. hash, UUID, etc.)
34+
35+
# Load the document from file using filename doc_file. This only loads
36+
# the QPDF document, but no extracted data
37+
success = parser.load_document(doc_key, doc_file)
38+
39+
# Open the file in binary mode and read its contents
40+
# with open(doc_file, "rb") as file:
41+
# file_content = file.read()
42+
43+
# Create a BytesIO object and write the file contents to it
44+
# bytes_io = io.BytesIO(file_content)
45+
# success = parser.load_document_from_bytesio(doc_key, bytes_io)
46+
47+
# Parse the entire document in one go, easier, but could require
48+
# a lot more memory than parsing page-by-page
49+
# json_doc = parser.parse_pdf_from_key(doc_key)
50+
51+
# Get number of pages
52+
num_pages = parser.number_of_pages(doc_key)
53+
54+
# Parse page by page to minimize memory footprint
55+
for page in range(0, num_pages):
56+
57+
# Internal memory for page is auto-deleted after this call.
58+
# No need to unload a specific page
59+
json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)
60+
61+
# The parsed page is the first entry of json_doc["pages"]
62+
json_page = json_doc["pages"][0]
63+
64+
page_dimensions = [json_page["dimensions"]["width"], json_page["dimensions"]["height"]]
65+
66+
# find text cells
67+
cells=[]
68+
for cell_id,cell in enumerate(json_page["cells"]):
69+
cells.append([page,
70+
cell_id,
71+
cell["content"]["rnormalized"], # text
72+
cell["box"]["device"][0], # x0 (lower left x)
73+
cell["box"]["device"][1], # y0 (lower left y)
74+
cell["box"]["device"][2], # x1 (upper right x)
75+
cell["box"]["device"][3], # y1 (upper right y)
76+
])
77+
78+
# find bitmap images
79+
images=[]
80+
for image_id,image in enumerate(json_page["images"]):
81+
images.append([page,
82+
image_id,
83+
image["box"][0], # x0 (lower left x)
84+
image["box"][1], # y0 (lower left y)
85+
image["box"][2], # x1 (upper right x)
86+
image["box"][3], # y1 (upper right y)
87+
])
88+
89+
# find paths
90+
paths=[]
91+
for path_id,path in enumerate(json_page["paths"]):
92+
paths.append([page,
93+
path_id,
94+
path["x-values"], # array of x values
95+
path["y-values"], # array of y values
96+
])
97+
98+
# Unload the (QPDF) document and buffers
99+
parser.unload_document(doc_key)
100+
101+
# Unloads everything at once
102+
# parser.unload_documents()
33103
```
34104

35105
Use the CLI

app/pybind_parse.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,25 @@ PYBIND11_MODULE(docling_parse, m) {
1616

1717
.def("set_loglevel", &docling::docling_parser::set_loglevel)
1818

19+
.def("is_loaded", &docling::docling_parser::is_loaded)
20+
.def("list_loaded_keys", &docling::docling_parser::list_loaded_keys)
21+
22+
.def("load_document", &docling::docling_parser::load_document)
23+
.def("load_document_from_bytesio", &docling::docling_parser::load_document_from_bytesio)
24+
25+
.def("unload_document", &docling::docling_parser::unload_document)
1926
.def("unload_documents", &docling::docling_parser::unload_documents)
2027

28+
.def("number_of_pages", &docling::docling_parser::number_of_pages)
29+
30+
.def("parse_pdf_from_key",
31+
pybind11::overload_cast<std::string>(&docling::docling_parser::parse_pdf_from_key),
32+
"parse pdf-document using doc-key into json")
33+
34+
.def("parse_pdf_from_key_on_page",
35+
&docling::docling_parser::parse_pdf_from_key_on_page,
36+
"parse specific page in pdf-document using doc-key from path into json")
37+
/*
2138
.def("find_cells",
2239
pybind11::overload_cast<std::string>(&docling::docling_parser::find_cells),
2340
"parse pdf-document from path into json")
@@ -32,5 +49,6 @@ PYBIND11_MODULE(docling_parse, m) {
3249
3350
.def("find_cells_from_bytesio_on_page",
3451
&docling::docling_parser::find_cells_from_bytesio_on_page,
35-
"parse pdf-document from a BytesIO object for a specific page");
52+
"parse pdf-document from a BytesIO object for a specific page")*/
53+
;
3654
}

docling_parse/run.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import io
33
import os
44

5+
from tabulate import tabulate
6+
57
# from docling_parse.docling_parse import pdf_parser
68
import docling_parse
79
from docling_parse import pdf_parser
@@ -28,6 +30,76 @@ def main():
2830
# Print the path to the PDF file (or add your processing logic here)
2931

3032
parser = docling_parse.pdf_parser()
33+
34+
doc_file = args.pdf # filename
35+
doc_key = f"key={args.pdf}" # unique document key (eg hash, UUID, etc)
36+
37+
# Load the document
38+
success = parser.load_document(doc_key, doc_file)
39+
40+
# Get number of pages
41+
num_pages = parser.number_of_pages(doc_key)
42+
43+
# Parse page by page to minimize memory footprint
44+
for page in range(0, num_pages):
45+
json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)
46+
json_page = json_doc["pages"][0]
47+
48+
page_dimensions = [
49+
json_page["dimensions"]["width"],
50+
json_page["dimensions"]["height"],
51+
]
52+
53+
# find text cells
54+
cells = []
55+
for cell_id, cell in enumerate(json_page["cells"]):
56+
cells.append(
57+
[
58+
page,
59+
cell_id,
60+
cell["content"]["rnormalized"], # text
61+
cell["box"]["device"][0], # x0 (lower left x)
62+
cell["box"]["device"][1], # y0 (lower left y)
63+
cell["box"]["device"][2], # x1 (upper right x)
64+
cell["box"]["device"][3], # y1 (upper right y)
65+
]
66+
)
67+
68+
print(f"cells of page: {page}")
69+
print(
70+
tabulate(cells, headers=["page", "cell-id", "text", "x0", "y0", "x1", "y1"])
71+
)
72+
73+
# find bitmap images
74+
images = []
75+
for image_id, image in enumerate(json_page["images"]):
76+
images.append(
77+
[
78+
page,
79+
image_id,
80+
image["box"][0], # x0 (lower left x)
81+
image["box"][1], # y0 (lower left y)
82+
image["box"][2], # x1 (upper right x)
83+
image["box"][3], # y1 (upper right y)
84+
]
85+
)
86+
87+
# find paths
88+
paths = []
89+
for path_id, path in enumerate(json_page["paths"]):
90+
paths.append(
91+
[
92+
page,
93+
path_id,
94+
path["x-values"], # array of x values
95+
path["y-values"], # array of y values
96+
]
97+
)
98+
99+
# Unload the document
100+
parser.unload_document(doc_key)
101+
102+
"""
31103
doc = parser.find_cells(args.pdf)
32104
33105
# print(json.dumps(data, indent=2))
@@ -55,6 +127,8 @@ def main():
55127
for j, cell in enumerate(page["cells"]):
56128
print(i, "\t", j, "\t", cell["content"]["rnormalized"])
57129
130+
"""
131+
58132

59133
if __name__ == "__main__":
60134
main()

poetry.lock

Lines changed: 15 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ build = "build.py"
3434

3535
[tool.poetry.dependencies]
3636
python = "^3.9"
37+
tabulate = ">=0.9.0,<1.0.0"
3738

3839
[tool.poetry.group.dev.dependencies]
3940
pytest = "^7.4.2"

0 commit comments

Comments
 (0)