Skip to content

Commit 195777b

Browse files
feat: add reading from BytesIO (#6)
* added reading from BytesIO Signed-off-by: Peter Staar <[email protected]> * run pre-commit hooks Signed-off-by: Peter Staar <[email protected]> --------- Signed-off-by: Peter Staar <[email protected]>
1 parent 58e30f0 commit 195777b

File tree

11 files changed

+212
-37
lines changed

11 files changed

+212
-37
lines changed

app/pybind_parse.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22

33
#include <pybind11/pybind11.h>
44
#include <pybind11/stl.h>
5+
#include <pybind11/buffer_info.h>
6+
//#include <pybind11/numpy.h>
57

68
#include <pybind/utils/pybind11_json.h>
9+
710
#include <pybind/docling_parser.h>
811

912
PYBIND11_MODULE(docling_parse, m) {
@@ -13,6 +16,8 @@ PYBIND11_MODULE(docling_parse, m) {
1316
.def("set_loglevel", &docling::docling_parser::set_loglevel)
1417
.def("find_cells",
1518
pybind11::overload_cast<std::string>(&docling::docling_parser::find_cells),
16-
"parse pdf-document from path into json");
17-
19+
"parse pdf-document from path into json")
20+
.def("find_cells_from_bytesio",
21+
&docling::docling_parser::find_cells_from_bytesio,
22+
"A function to read a BytesIO object");
1823
}

build.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,4 @@ def build_local(multi_threaded=True):
4747

4848
if "__main__" == __name__:
4949

50-
build_local(multi_threaded=False)
50+
build_local(multi_threaded=True)

docling_parse/run.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
import argparse
2+
import io
23
import os
34

4-
from docling_parse.docling_parse import pdf_parser
5+
# from docling_parse.docling_parse import pdf_parser
6+
import docling_parse
7+
from docling_parse import pdf_parser
58

69

710
def main():
@@ -24,12 +27,28 @@ def main():
2427

2528
# Parse the PDF file given on the command line and list the extracted cells
2629

27-
parser = pdf_parser()
30+
parser = docling_parse.pdf_parser()
2831
doc = parser.find_cells(args.pdf)
2932

3033
# print(json.dumps(data, indent=2))
31-
print(doc.keys())
34+
print("keys: ", doc.keys())
35+
for i, page in enumerate(doc["pages"]):
36+
print(page.keys())
37+
38+
for j, cell in enumerate(page["cells"]):
39+
print(i, "\t", j, "\t", cell["content"]["rnormalized"])
3240

41+
# Open the file in binary mode and read its contents
42+
with open(args.pdf, "rb") as file:
43+
file_content = file.read()
44+
45+
# Wrap the file contents in an in-memory BytesIO stream
46+
bytes_io = io.BytesIO(file_content)
47+
48+
doc = parser.find_cells_from_bytesio(bytes_io)
49+
50+
# print(json.dumps(data, indent=2))
51+
print("keys: ", doc.keys())
3352
for i, page in enumerate(doc["pages"]):
3453
print(page.keys())
3554

src/proj_folders/pdf_interface/parser.h

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ namespace pdf_lib
3939
bool parse_pdf_page(std::string id,
4040
container_lib::container& raw_page);
4141

42+
bool parse_pdf_page(const char* buffer, std::size_t size,
43+
container_lib::container& raw_page);
44+
4245
bool clean_raw_page(container_lib::container& raw_page);
4346

4447
void clean_pages(container_lib::container& raw_doc);
@@ -500,6 +503,44 @@ namespace pdf_lib
500503
return true;
501504
}
502505

506+
/**
 * Parse a PDF that is held in memory and write the result into `raw_page`.
 *
 * @param buffer   Pointer to the first byte of the PDF data (not owned here).
 * @param size     Number of bytes available at `buffer`.
 * @param raw_page Output container; it is cleared first and then filled by
 *                 pdf_lib::core::writer.
 * @return true on success; false when the buffer is null/empty or when the
 *         conversion of the parsed document into the container throws.
 *
 * NOTE: exceptions thrown while parsing (load_buffer / process_all) are not
 * caught here and propagate to the caller; the try/catch around that step
 * was disabled by the author and is intentionally left that way.
 */
bool interface<PARSER>::parse_pdf_page(const char* buffer, std::size_t size,
                                       container_lib::container &raw_page)
{
  logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

  raw_page.clear();

  // qpdf::parser::load_buffer terminates the process (std::exit) when qpdf
  // rejects its input, so an obviously unusable buffer is refused up front
  // and reported through the normal boolean return instead.
  if(buffer == nullptr || size == 0)
    {
      logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
                                       << "\t ERROR: empty pdf-buffer !!\n";
      return false;
    }

  pdf_lib::core::object<pdf_lib::core::DOCUMENT> doc;

  {
    // Label handed to qpdf as the description of the in-memory file.
    const std::string desc = "parsing document buffer via BytesIO";

    pdf_lib::qpdf::parser<pdf_lib::core::DOCUMENT> parser(doc);
    parser.load_buffer(desc.c_str(), buffer, size).process_all();
  }

  try
    {
      // Convert the parsed document into the generic container.
      pdf_lib::core::writer writer;
      writer.execute(doc, raw_page);
    }
  catch (...)
    {
      logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__
                                       << "\t ERROR in conversion pdf_lib::core::DOCUMENT --> container !!\n";
      return false;
    }

  return true;
}
543+
503544
bool interface<PARSER>::clean_raw_page(container_lib::container& raw_page)
504545
{
505546
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

src/proj_folders/pdf_library.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,12 @@ Date: 18/08/2017
1515
#ifndef PDF_LIBRARY_H
1616
#define PDF_LIBRARY_H
1717

18-
#ifdef __HAVE_QPDF
18+
//#ifdef __HAVE_QPDF
19+
20+
#define POINTERHOLDER_TRANSITION 0
1921
#include <qpdf/QPDF.hh>
20-
#endif
22+
23+
//#endif
2124

2225
//#include "logging_library.h"
2326
//#include "container_library.h"

src/proj_folders/pdf_library/core/object/stream.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ Date: 18/08/2017
1515
#ifndef PDF_LIBRARY_OBJECT_XOBJECT
1616
#define PDF_LIBRARY_OBJECT_XOBJECT
1717

18-
#include<qpdf/QPDF.hh>
18+
//#include<qpdf/QPDF.hh>
1919

2020
#include<pdf_library/core.h>
2121

src/proj_folders/pdf_library/core/object/xobject.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ Date: 18/08/2017
1515
#ifndef PDF_LIBRARY_CORE_OBJECT_XOBJECT
1616
#define PDF_LIBRARY_CORE_OBJECT_XOBJECT
1717

18-
#include<qpdf/QPDF.hh>
18+
//#include<qpdf/QPDF.hh>
1919

2020
#include<pdf_library/core.h>
2121

src/proj_folders/pdf_library/qpdf.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ Date: 18/08/2017
1818
#include <iostream>
1919
#include <iomanip>
2020

21-
#include <qpdf/QPDF.hh>
21+
//#include <qpdf/QPDF.hh>
2222

2323
namespace pdf_lib
2424
{

src/proj_folders/pdf_library/qpdf/parser/document.h

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,21 +52,24 @@ namespace pdf_lib
5252

5353
parser<core::DOCUMENT>::parser(core::object<core::DOCUMENT>& doc) :
5454
core::parser<core::DOCUMENT>(doc)
55-
{
55+
{
56+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
5657
}
5758

5859
parser<core::DOCUMENT>::~parser()
5960
{}
6061

6162
parser<core::DOCUMENT>& parser<core::DOCUMENT>::load_document(const std::string file)
6263
{
64+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
6365
try
6466
{
6567
_qpdf.processFile(file.c_str());
6668
}
6769
catch (std::exception & e)
6870
{
69-
std::cerr << e.what() << std::endl;
71+
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << e.what();
72+
//std::cerr << e.what() << std::endl;
7073
std::exit(2);
7174
}
7275

@@ -77,14 +80,18 @@ namespace pdf_lib
7780
char const* buf,
7881
size_t length)
7982
{
83+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
84+
85+
8086
try
8187
{
8288
_qpdf.processMemoryFile(description,
8389
buf, length);
8490
}
8591
catch(std::exception & e)
8692
{
87-
std::cerr << e.what() << std::endl;
93+
logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << e.what();
94+
//std::cerr << e.what() << std::endl;
8895
std::exit(2);
8996
}
9097

@@ -93,12 +100,14 @@ namespace pdf_lib
93100

94101
void parser<core::DOCUMENT>::parse()
95102
{
96-
logging_lib::info("pdf-parser") << "qpdf::parser<core::DOCUMENT>::parse()";
103+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
97104
process_all();
98105
}
99106

100107
core::object<core::PAGE> & parser<core::DOCUMENT>::process_page(size_t index)
101108
{
109+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
110+
102111
core::object<core::PAGE>& page = object().get_page(index);
103112
page.path = "/root";
104113

@@ -112,6 +121,8 @@ namespace pdf_lib
112121

113122
core::object<core::DOCUMENT> & parser<core::DOCUMENT>::process_all()
114123
{
124+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
125+
115126
for(QPDFObjectHandle handle : _qpdf.getAllPages())
116127
{
117128
core::object<core::PAGE>& page = object().get_page();

src/pybind/docling_parser.h

Lines changed: 63 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@ namespace docling
3535
void set_loglevel();
3636

3737
nlohmann::json get_raw(std::string path);
38-
38+
3939
nlohmann::json find_cells(std::string path);
4040

41+
nlohmann::json find_cells_from_bytesio(pybind11::object bytes_io);
42+
4143
private:
4244

4345
pdf_lib::interface<pdf_lib::PARSER> interface;
@@ -99,36 +101,76 @@ namespace docling
99101
// remove font information ...
100102
for(int pid=0; pid<doc_raw["pages"].get_size(); pid++)
101103
{
102-
/*
103-
container_lib::container page = doc_raw["pages"][pid];
104-
for(int cid=0; cid<page["cells"].get_size(); cid++)
105-
{
106-
IO::writer<IO::JSON_CONTAINER> writer;
107-
std::string result = writer.to_string(page["cells"][cid]);
108-
///std::cout << "could not parse: " << result << "\n";
109-
110-
try
111-
{
112-
nlohmann::json data = nlohmann::json::parse(result);
113-
std::cout << pid << "\t" << cid << " -> parsed\n";
114-
}
115-
catch(...)
116-
{
117-
std::cout << "could not parse: " << result << "\n";
118-
}
119-
}
120-
*/
121104
doc_raw["pages"][pid].erase("fonts");
122105
}
123106

124107
IO::writer<IO::JSON_CONTAINER> writer;
125108
std::string result = writer.to_string(doc_raw);
126-
//std::cout << result << "\n";
127109

128110
nlohmann::json data = nlohmann::json::parse(result);
129111
return data;
130112
}
131113

114+
/**
 * Parse a PDF supplied as a Python file-like object (e.g. io.BytesIO).
 *
 * The stream is rewound to offset 0, read completely and handed to the
 * pdf-interface as an in-memory buffer.
 *
 * @param bytes_io Python object that provides `seek` and `read`; `read()`
 *                 is expected to return a bytes object.
 * @return The cleaned document (with the per-page "fonts" entries removed)
 *         as JSON, or a JSON object with a single "message" key when the
 *         document could not be parsed or cleaned.
 * @throws std::runtime_error if `bytes_io` lacks `read` or `seek`.
 */
nlohmann::json docling_parser::find_cells_from_bytesio(pybind11::object bytes_io)
{
  // Both methods are called below, so both have to be present.
  if (!pybind11::hasattr(bytes_io, "read") || !pybind11::hasattr(bytes_io, "seek"))
    {
      throw std::runtime_error("Expected a BytesIO object");
    }

  // Rewind, so the whole content is read regardless of the current position.
  bytes_io.attr("seek")(0);

  // Read the entire stream and copy it into a C++ owned buffer.
  pybind11::bytes py_bytes = bytes_io.attr("read")();
  const std::string pdf_bytes = py_bytes.cast<std::string>();

  logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
                                  << "read " << pdf_bytes.size()
                                  << " bytes from the BytesIO stream";

  container_lib::container doc_raw;

  // Parse the PDF from the in-memory buffer. An empty stream is rejected
  // before it reaches the parser, because the qpdf loader exits the process
  // when it is given input it cannot read.
  if (pdf_bytes.empty() or
      not interface.parse_pdf_page(pdf_bytes.data(), pdf_bytes.size(), doc_raw))
    {
      logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
                                       << "could not parse the PDF file";

      nlohmann::json failure;
      failure["message"] = "could not parse the PDF file";
      return failure;
    }

  if (not interface.clean_raw_page(doc_raw))
    {
      logging_lib::error("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t"
                                       << "could not clean the raw file";

      nlohmann::json failure;
      failure["message"] = "could not clean the raw file";
      return failure;
    }

  interface.clean_pages(doc_raw);

  // remove font information ...
  for(int pid=0; pid<doc_raw["pages"].get_size(); pid++)
    {
      doc_raw["pages"][pid].erase("fonts");
    }

  // Serialise the container to a JSON string and re-parse it as nlohmann::json.
  IO::writer<IO::JSON_CONTAINER> writer;
  const std::string result = writer.to_string(doc_raw);

  return nlohmann::json::parse(result);
}
173+
132174
}
133175

134176
#endif

0 commit comments

Comments
 (0)