@@ -35,9 +35,11 @@ namespace docling
3535 void set_loglevel ();
3636
3737 nlohmann::json get_raw (std::string path);
38-
38+
3939 nlohmann::json find_cells (std::string path);
4040
41+ nlohmann::json find_cells_from_bytesio (pybind11::object bytes_io);
42+
4143 private:
4244
4345 pdf_lib::interface<pdf_lib::PARSER> interface;
@@ -99,36 +101,76 @@ namespace docling
99101 // remove font information ...
100102 for (int pid=0 ; pid<doc_raw[" pages" ].get_size (); pid++)
101103 {
102- /*
103- container_lib::container page = doc_raw["pages"][pid];
104- for(int cid=0; cid<page["cells"].get_size(); cid++)
105- {
106- IO::writer<IO::JSON_CONTAINER> writer;
107- std::string result = writer.to_string(page["cells"][cid]);
108- ///std::cout << "could not parse: " << result << "\n";
109-
110- try
111- {
112- nlohmann::json data = nlohmann::json::parse(result);
113- std::cout << pid << "\t" << cid << " -> parsed\n";
114- }
115- catch(...)
116- {
117- std::cout << "could not parse: " << result << "\n";
118- }
119- }
120- */
121104 doc_raw[" pages" ][pid].erase (" fonts" );
122105 }
123106
124107 IO::writer<IO::JSON_CONTAINER> writer;
125108 std::string result = writer.to_string (doc_raw);
126- // std::cout << result << "\n";
127109
128110 nlohmann::json data = nlohmann::json::parse (result);
129111 return data;
130112 }
131113
114+ nlohmann::json docling_parser::find_cells_from_bytesio (pybind11::object bytes_io)
115+ {
116+ // Check if the object is a BytesIO object
117+ if (!pybind11::hasattr (bytes_io, " read" )) {
118+
119+ throw std::runtime_error (" Expected a BytesIO object" );
120+ }
121+
122+ // Seek to the beginning of the BytesIO stream
123+ bytes_io.attr (" seek" )(0 );
124+
125+ // Read the entire content of the BytesIO stream
126+ pybind11::bytes data = bytes_io.attr (" read" )();
127+
128+ // Get a pointer to the data
129+ std::string data_str = data.cast <std::string>();
130+
131+ // Do something with the data (in this case, simply print the size)
132+ std::cout << " Read " << data_str.size () << " bytes from the BytesIO stream" << std::endl;
133+
134+ // std::string path;
135+
136+ container_lib::container doc_raw;
137+
138+ // parse the pdf file on path
139+ if (not interface.parse_pdf_page (data_str.c_str (), data_str.size (), doc_raw))
140+ {
141+ logging_lib::error (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " \t "
142+ << " could not parse the PDF file" ;
143+
144+ nlohmann::json data;
145+ data[" message" ] = " could not parse the PDF file" ;
146+ return data;
147+ }
148+
149+ if (not interface.clean_raw_page (doc_raw))
150+ {
151+ logging_lib::error (" pdf-parser" ) << __FILE__ << " :" << __LINE__ << " \t "
152+ << " could not clean the raw file" ;
153+
154+ nlohmann::json data;
155+ data[" message" ] = " could not clean the raw file" ;
156+ return data;
157+ }
158+
159+ interface.clean_pages (doc_raw);
160+
161+ // remove font information ...
162+ for (int pid=0 ; pid<doc_raw[" pages" ].get_size (); pid++)
163+ {
164+ doc_raw[" pages" ][pid].erase (" fonts" );
165+ }
166+
167+ IO::writer<IO::JSON_CONTAINER> writer;
168+ std::string result = writer.to_string (doc_raw);
169+
170+ nlohmann::json output = nlohmann::json::parse (result);
171+ return output;
172+ }
173+
132174}
133175
134176#endif
0 commit comments