Skip to content

Commit fe3482f

Browse files
feat: add page unloading (#150)
Signed-off-by: Peter Staar <[email protected]>
1 parent 4a578a1 commit fe3482f

File tree

9 files changed

+150
-14
lines changed

9 files changed

+150
-14
lines changed

README.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,6 @@ options:
9999
-p PDF, --pdf PDF Path to the PDF file
100100
```
101101

102-
103102
## Performance Benchmarks
104103

105104
### Characteristics of different parser versions
@@ -185,22 +184,26 @@ If you dont have an input file, then a template input file will be printed on th
185184

186185
To build the package, simply run (make sure [uv](https://docs.astral.sh/uv/) is [installed](https://docs.astral.sh/uv/getting-started/installation)):
187186

188-
```
187+
```sh
189188
uv sync
190189
```
191190

192-
To test the package, run:
191+
This command will only work after a clean `git clone`. If you are developing and updating C++ code, please use:
193192

193+
```sh
194+
uv pip install --force-reinstall --no-deps -e .
194195
```
196+
197+
To test the package, run:
198+
199+
```sh
195200
uv run pytest ./tests -v -s
196201
```
197202

198-
199203
## Contributing
200204

201205
Please read [Contributing to Docling Parse](https://github.com/docling-project/docling-parse/blob/main/CONTRIBUTING.md) for details.
202206

203-
204207
## References
205208

206209
If you use Docling in your projects, please consider citing the following:

app/pybind_parse.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,36 @@ PYBIND11_MODULE(pdf_parsers, m) {
291291
Returns:
292292
bool: True if the document was successfully unloaded, False otherwise.)")
293293

294+
.def("unload_document_pages",
295+
[](docling::docling_parser_v2 &self, const std::string &key) -> bool {
296+
return self.unload_document_pages(key);
297+
},
298+
pybind11::arg("key"),
299+
R"(
300+
Unload only the cached pages of the document by its unique key.
301+
302+
Parameters:
303+
key (str): The unique key of the document to unload.
304+
305+
Returns:
306+
bool: True if the document was successfully unloaded, False otherwise.)")
307+
308+
.def("unload_document_page",
309+
[](docling::docling_parser_v2 &self, const std::string &key, int page) -> bool {
310+
return self.unload_document_page(key, page);
311+
},
312+
pybind11::arg("key"),
313+
pybind11::arg("page"),
314+
R"(
315+
Unload a single page of the document by its unique key and page_number.
316+
317+
Parameters:
318+
key (str): The unique key of the document to unload.
319+
page (int): The page number of the document to unload.
320+
321+
Returns:
322+
bool: True if the document was successfully unloaded, False otherwise.)")
323+
294324
.def("number_of_pages",
295325
[](docling::docling_parser_v2 &self, const std::string &key) -> int {
296326
return self.number_of_pages(key);

docling_parse/pdf_parser.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@ def unload(self) -> bool:
5858
else:
5959
return False
6060

61+
def unload_pages(self, page_range: tuple[int, int]):
62+
"""unload page in range [page_range[0], page_range[1]["""
63+
for page_no in range(page_range[0], page_range[1]):
64+
if page_no in self._pages:
65+
self._parser.unload_document_page(key=self._key, page=page_no)
66+
del self._pages[page_no]
67+
6168
def number_of_pages(self) -> int:
6269
if self.is_loaded():
6370
return self._parser.number_of_pages(key=self._key)

docling_parse/visualize.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,8 @@ def visualise_py(
237237
print(f"text-lines (sanitized, page_no: {page_no}):")
238238
print("\n".join(lines))
239239

240+
pdf_doc.unload_pages(page_range=(page_no, page_no + 1))
241+
240242

241243
def main():
242244

src/pybind/docling_parser_v2.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ namespace docling
3535
bool load_document_from_bytesio(std::string key, pybind11::object bytes_io);
3636

3737
bool unload_document(std::string key);
38+
bool unload_document_pages(std::string key);
39+
bool unload_document_page(std::string key, int page_num);
3840

3941
void unload_documents();
4042

@@ -254,6 +256,40 @@ namespace docling
254256
return false;
255257
}
256258

259+
bool docling_parser_v2::unload_document_page(std::string key, int page_num)
260+
{
261+
auto itr = key2doc.find(key);
262+
263+
if(itr!=key2doc.end())
264+
{
265+
decoder_ptr_type decoder_ptr = itr->second;
266+
decoder_ptr->unload_page(page_num);
267+
}
268+
else
269+
{
270+
LOG_S(ERROR) << "key not found: " << key;
271+
}
272+
273+
return false;
274+
}
275+
276+
bool docling_parser_v2::unload_document_pages(std::string key)
277+
{
278+
auto itr = key2doc.find(key);
279+
280+
if(itr!=key2doc.end())
281+
{
282+
decoder_ptr_type decoder_ptr = itr->second;
283+
decoder_ptr->unload_pages();
284+
}
285+
else
286+
{
287+
LOG_S(ERROR) << "key not found: " << key;
288+
}
289+
290+
return false;
291+
}
292+
257293
void docling_parser_v2::unload_documents()
258294
{
259295
key2doc.clear();

src/v1/proj_folders/pdf_library/qpdf/parser/font.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -911,10 +911,9 @@ namespace pdf_lib
911911
std::string name = _handle.getKey("/Subtype").getName();
912912
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t detected /Subtype: " << name;
913913

914-
//std::cout << name << "\n";
915-
914+
//if(bbox) // use the new `bool()`
916915
if(bbox.isInitialized())
917-
{
916+
{
918917
if(not fm.ascent)
919918
{
920919
fm.ascent = bbox.getArrayItem(3).getNumericValue();

src/v2/pdf_decoders/document.h

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,15 @@ namespace pdflib
2929

3030
bool process_document_from_file(std::string& _filename);
3131
bool process_document_from_bytesio(std::string& _buffer);
32-
32+
3333
void decode_document(std::string page_boundary, bool do_sanitization);
3434

3535
void decode_document(std::vector<int>& page_numbers, std::string page_boundary, bool do_sanitization);
3636

37+
bool unload_pages();
38+
39+
bool unload_page(int page_number);
40+
3741
private:
3842

3943
void update_qpdf_logger();
@@ -232,7 +236,7 @@ namespace pdflib
232236
{
233237
utils::timer page_timer;
234238

235-
pdf_decoder<PAGE> page_decoder(page);
239+
pdf_decoder<PAGE> page_decoder(page, page_number);
236240

237241
auto timings_ = page_decoder.decode_page(page_boundary, do_sanitization);
238242
update_timings(timings_, set_timer);
@@ -271,7 +275,7 @@ namespace pdflib
271275
{
272276
utils::timer page_timer;
273277

274-
pdf_decoder<PAGE> page_decoder(pages.at(page_number));
278+
pdf_decoder<PAGE> page_decoder(pages.at(page_number), page_number);
275279

276280
auto timings_ = page_decoder.decode_page(page_boundary, do_sanitization);
277281

@@ -313,6 +317,45 @@ namespace pdflib
313317
}
314318
}
315319

320+
bool pdf_decoder<DOCUMENT>::unload_page(int page_number)
321+
{
322+
if(not json_document.contains("pages"))
323+
{
324+
LOG_S(WARNING) << "json_document does not have `pages`";
325+
return false;
326+
}
327+
328+
nlohmann::json& json_pages = json_document["pages"];
329+
330+
for(int l=0; l<json_pages.size(); l++)
331+
{
332+
if((json_pages[l].is_object()) and
333+
(json_pages[l].contains("page_number")) and
334+
(json_pages[l]["page_number"]==page_number))
335+
{
336+
json_pages[l].clear();
337+
338+
nlohmann::json none;
339+
json_pages[l] = none;
340+
}
341+
}
342+
343+
return true;
344+
}
345+
346+
bool pdf_decoder<DOCUMENT>::unload_pages()
347+
{
348+
if(not json_document.contains("pages"))
349+
{
350+
LOG_S(WARNING) << "json_document does not have `pages`";
351+
return false;
352+
}
353+
354+
json_document["pages"] = nlohmann::json::array({});
355+
356+
return true;
357+
}
358+
316359
}
317360

318361
#endif

src/v2/pdf_decoders/page.h

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@ namespace pdflib
1616
{
1717
public:
1818

19-
pdf_decoder(QPDFObjectHandle page);
19+
pdf_decoder(QPDFObjectHandle page, int page_num);
2020
~pdf_decoder();
2121

22+
int get_page_number();
23+
2224
nlohmann::json get();
2325

2426
std::map<std::string, double> decode_page(std::string page_boundary, bool do_sanitization);
@@ -49,6 +51,9 @@ namespace pdflib
4951
private:
5052

5153
QPDFObjectHandle qpdf_page;
54+
55+
int page_number;
56+
5257
QPDFObjectHandle qpdf_parent_resources;
5358
QPDFObjectHandle qpdf_resources;
5459
QPDFObjectHandle qpdf_grphs;
@@ -81,19 +86,27 @@ namespace pdflib
8186
std::map<std::string, double> timings;
8287
};
8388

84-
pdf_decoder<PAGE>::pdf_decoder(QPDFObjectHandle page):
85-
qpdf_page(page)
89+
pdf_decoder<PAGE>::pdf_decoder(QPDFObjectHandle page, int page_num):
90+
qpdf_page(page),
91+
page_number(page_num)
8692
{
8793
}
8894

8995
pdf_decoder<PAGE>::~pdf_decoder()
9096
{
9197
}
98+
99+
int pdf_decoder<PAGE>::get_page_number()
100+
{
101+
return page_number;
102+
}
92103

93104
nlohmann::json pdf_decoder<PAGE>::get()
94105
{
95106
nlohmann::json result;
96107
{
108+
result["page_number"] = page_number;
109+
97110
result["annotations"] = json_annots;
98111

99112
nlohmann::json& timings_ = result["timings"];

tests/test_parse.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,9 @@ def test_reference_documents_from_filenames():
288288
img = pred_page.render_as_image(cell_unit=TextCellUnit.LINE)
289289
# img.show()
290290

291+
print(f"unloading page: {page_no}")
292+
pdf_doc.unload_pages(page_range=(page_no, page_no + 1))
293+
291294
toc: PdfTableOfContents = pdf_doc.get_table_of_contents()
292295
"""
293296
if toc is not None:

0 commit comments

Comments
 (0)