Skip to content

Commit 48451ad

Browse files
feat: fixed the v2 parser to only return the pages that are requested (#47)
* fixed the v2 parser to only return the pages that are requested
  Signed-off-by: Peter Staar <[email protected]>
* updated the visualize script
  Signed-off-by: Peter Staar <[email protected]>
* fixed the default args for compilation
  Signed-off-by: Peter Staar <[email protected]>
* put std::make_pair to avoid warnings
  Signed-off-by: Peter Staar <[email protected]>
---------
Signed-off-by: Peter Staar <[email protected]>
1 parent 836571a commit 48451ad

32 files changed

+568
-2024476
lines changed

app/parse_v2.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ void set_loglevel(std::string level)
2525
//loguru::set_verbosity(loguru::Verbosity_ERROR);
2626
}
2727
else
28-
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR; {
29-
30-
}
28+
{
29+
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
30+
}
3131
}
3232

3333
nlohmann::json create_config(std::filesystem::path ifile,

docling_parse/visualize.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from tabulate import tabulate
77

8-
from docling_parse.docling_parse import pdf_parser, pdf_parser_v2
8+
from docling_parse import pdf_parser, pdf_parser_v2
99

1010
try:
1111
from PIL import Image, ImageDraw

src/pybind/docling_parser_v2.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,7 @@ namespace docling
4343

4444
std::string pdf_resources_dir;
4545

46-
//std::map<std::string, std::filesystem::path> key2doc;
4746
std::map<std::string, decoder_ptr_type> key2doc;
48-
49-
//plib::parser parser;
5047
};
5148

5249
docling_parser_v2::docling_parser_v2():
@@ -82,7 +79,6 @@ namespace docling
8279
std::map<std::string, double> timings = {};
8380
pdflib::pdf_resource<pdflib::PAGE_FONT>::initialise(data, timings);
8481
}
85-
8682

8783
void docling_parser_v2::set_loglevel(int level)
8884
{
@@ -114,7 +110,7 @@ namespace docling
114110
{
115111
loguru::g_stderr_verbosity = loguru::Verbosity_INFO;
116112
}
117-
else if(level=="warning")
113+
else if(level=="warning" or level=="warn")
118114
{
119115
loguru::g_stderr_verbosity = loguru::Verbosity_WARNING;
120116
}

src/v2/pdf_decoders/document.h

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ namespace pdflib
3131

3232
private:
3333

34-
void update_timings(std::map<std::string, double>& timings_);
34+
void update_timings(std::map<std::string, double>& timings_, bool set_timer);
3535

3636
private:
3737

@@ -181,7 +181,10 @@ namespace pdflib
181181
utils::timer timer;
182182

183183
nlohmann::json& json_pages = json_document["pages"];
184-
184+
json_pages = nlohmann::json::array({});
185+
186+
bool set_timer=true;
187+
185188
int page_number=0;
186189
for(QPDFObjectHandle page : qpdf_document.getAllPages())
187190
{
@@ -190,7 +193,8 @@ namespace pdflib
190193
pdf_decoder<PAGE> page_decoder(page);
191194

192195
auto timings_ = page_decoder.decode_page();
193-
update_timings(timings_);
196+
update_timings(timings_, set_timer);
197+
set_timer = false;
194198

195199
json_pages.push_back(page_decoder.get());
196200

@@ -208,10 +212,13 @@ namespace pdflib
208212
LOG_S(INFO) << "start decoding selected pages ...";
209213
utils::timer timer;
210214

215+
// make sure that we only return the page from the page-numbers
211216
nlohmann::json& json_pages = json_document["pages"];
212-
217+
json_pages = nlohmann::json::array({});
218+
213219
std::vector<QPDFObjectHandle> pages = qpdf_document.getAllPages();
214-
220+
221+
bool set_timer=true; // make sure we override all timings for this page-set
215222
for(auto page_number:page_numbers)
216223
{
217224
utils::timer timer;
@@ -223,7 +230,9 @@ namespace pdflib
223230
pdf_decoder<PAGE> page_decoder(pages.at(page_number));
224231

225232
auto timings_ = page_decoder.decode_page();
226-
update_timings(timings_);
233+
234+
update_timings(timings_, set_timer);
235+
set_timer=false;
227236

228237
json_pages.push_back(page_decoder.get());
229238

@@ -244,11 +253,11 @@ namespace pdflib
244253
timings[__FUNCTION__] = timer.get_time();
245254
}
246255

247-
void pdf_decoder<DOCUMENT>::update_timings(std::map<std::string, double>& timings_)
256+
void pdf_decoder<DOCUMENT>::update_timings(std::map<std::string, double>& timings_, bool set_timer)
248257
{
249258
for(auto itr=timings_.begin(); itr!=timings_.end(); itr++)
250259
{
251-
if(timings.count(itr->first)==0)
260+
if(timings.count(itr->first)==0 or set_timer)
252261
{
253262
timings[itr->first] = itr->second;
254263
}

src/v2/pdf_resources/page_line.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,19 +82,22 @@ namespace pdflib
8282
std::pair<double, double> pdf_resource<PAGE_LINE>::front()
8383
{
8484
assert(x.size()>0);
85-
return std::pair<double, double>(x.front(), y.front());
85+
//return std::pair<double, double>(x.front(), y.front());
86+
return std::make_pair(x.front(), y.front());
8687
}
8788

8889
std::pair<double, double> pdf_resource<PAGE_LINE>::back()
8990
{
9091
assert(x.size()>0);
91-
return std::pair<double, double>(x.back(), y.back());
92+
//return std::pair<double, double>(x.back(), y.back());
93+
return std::make_pair(x.back(), y.back());
9294
}
9395

9496
std::pair<double, double> pdf_resource<PAGE_LINE>::operator[](int i)
9597
{
9698
assert(x.size()>0 and i<x.size());
97-
return std::pair<double, double>(x[i], y[i]);
99+
//return std::pair<double, double>(x[i], y[i]);
100+
return std::make_pair(x[i], y[i]);
98101
}
99102

100103
void pdf_resource<PAGE_LINE>::transform(std::array<double, 9> trafo_matrix)

tests/pdf_docs/tests/2305.14962v1.pdf.v2.bytesio.json

Lines changed: 63 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -16684,11 +16684,11 @@
1668416684
"lines": []
1668516685
},
1668616686
"timings": {
16687-
"decode_contents": 0.003507,
16688-
"decode_dimensions": 0.0,
16689-
"decode_page": 0.015862,
16690-
"decode_resources": 0.008806,
16691-
"sanitise_contents": 4.1e-05
16687+
"decode_contents": 0.003358,
16688+
"decode_dimensions": 4e-06,
16689+
"decode_page": 0.015178,
16690+
"decode_resources": 0.008326,
16691+
"sanitise_contents": 3.9e-05
1669216692
}
1669316693
},
1669416694
{
@@ -29861,11 +29861,11 @@
2986129861
]
2986229862
},
2986329863
"timings": {
29864-
"decode_contents": 0.015415,
29864+
"decode_contents": 0.015422,
2986529865
"decode_dimensions": 0.0,
29866-
"decode_page": 0.02518,
29867-
"decode_resources": 0.007428,
29868-
"sanitise_contents": 3.2e-05
29866+
"decode_page": 0.025149,
29867+
"decode_resources": 0.007392,
29868+
"sanitise_contents": 2.9e-05
2986929869
}
2987029870
},
2987129871
{
@@ -40840,10 +40840,10 @@
4084040840
]
4084140841
},
4084240842
"timings": {
40843-
"decode_contents": 0.002348,
40843+
"decode_contents": 0.00241,
4084440844
"decode_dimensions": 0.0,
40845-
"decode_page": 0.011055,
40846-
"decode_resources": 0.006615,
40845+
"decode_page": 0.011236,
40846+
"decode_resources": 0.006414,
4084740847
"sanitise_contents": 2.5e-05
4084840848
}
4084940849
},
@@ -54715,10 +54715,10 @@
5471554715
]
5471654716
},
5471754717
"timings": {
54718-
"decode_contents": 0.004488,
54718+
"decode_contents": 0.004505,
5471954719
"decode_dimensions": 0.0,
54720-
"decode_page": 0.012061,
54721-
"decode_resources": 0.005828,
54720+
"decode_page": 0.012243,
54721+
"decode_resources": 0.005641,
5472254722
"sanitise_contents": 2.8e-05
5472354723
}
5472454724
},
@@ -71744,11 +71744,11 @@
7174471744
]
7174571745
},
7174671746
"timings": {
71747-
"decode_contents": 0.002497,
71747+
"decode_contents": 0.002465,
7174871748
"decode_dimensions": 0.0,
71749-
"decode_page": 0.011164,
71750-
"decode_resources": 0.006387,
71751-
"sanitise_contents": 4.9e-05
71749+
"decode_page": 0.011008,
71750+
"decode_resources": 0.006174,
71751+
"sanitise_contents": 4.1e-05
7175271752
}
7175371753
},
7175471754
{
@@ -88941,11 +88941,11 @@
8894188941
]
8894288942
},
8894388943
"timings": {
88944-
"decode_contents": 0.004848,
88944+
"decode_contents": 0.004823,
8894588945
"decode_dimensions": 0.0,
88946-
"decode_page": 0.015907,
88947-
"decode_resources": 0.008227,
88948-
"sanitise_contents": 4e-05
88946+
"decode_page": 0.016107,
88947+
"decode_resources": 0.00802,
88948+
"sanitise_contents": 3.5e-05
8894988949
}
8895088950
},
8895188951
{
@@ -109738,11 +109738,11 @@
109738109738
]
109739109739
},
109740109740
"timings": {
109741-
"decode_contents": 0.010908,
109741+
"decode_contents": 0.01093,
109742109742
"decode_dimensions": 0.0,
109743-
"decode_page": 0.018141,
109744-
"decode_resources": 0.005595,
109745-
"sanitise_contents": 4e-05
109743+
"decode_page": 0.018393,
109744+
"decode_resources": 0.005545,
109745+
"sanitise_contents": 3.8e-05
109746109746
}
109747109747
},
109748109748
{
@@ -126017,11 +126017,11 @@
126017126017
"lines": []
126018126018
},
126019126019
"timings": {
126020-
"decode_contents": 0.002365,
126020+
"decode_contents": 0.00238,
126021126021
"decode_dimensions": 0.0,
126022-
"decode_page": 0.008046,
126023-
"decode_resources": 0.003733,
126024-
"sanitise_contents": 3.8e-05
126022+
"decode_page": 0.007619,
126023+
"decode_resources": 0.003613,
126024+
"sanitise_contents": 3.7e-05
126025126025
}
126026126026
},
126027126027
{
@@ -140088,11 +140088,11 @@
140088140088
"lines": []
140089140089
},
140090140090
"timings": {
140091-
"decode_contents": 0.002037,
140091+
"decode_contents": 0.002042,
140092140092
"decode_dimensions": 0.0,
140093-
"decode_page": 0.008592,
140094-
"decode_resources": 0.004611,
140095-
"sanitise_contents": 3.2e-05
140093+
"decode_page": 0.008342,
140094+
"decode_resources": 0.004467,
140095+
"sanitise_contents": 3.5e-05
140096140096
}
140097140097
},
140098140098
{
@@ -156805,10 +156805,10 @@
156805156805
]
156806156806
},
156807156807
"timings": {
156808-
"decode_contents": 0.002362,
156808+
"decode_contents": 0.002402,
156809156809
"decode_dimensions": 0.0,
156810-
"decode_page": 0.010877,
156811-
"decode_resources": 0.006402,
156810+
"decode_page": 0.010979,
156811+
"decode_resources": 0.006272,
156812156812
"sanitise_contents": 3.8e-05
156813156813
}
156814156814
},
@@ -176492,10 +176492,10 @@
176492176492
"lines": []
176493176493
},
176494176494
"timings": {
176495-
"decode_contents": 0.003171,
176495+
"decode_contents": 0.003227,
176496176496
"decode_dimensions": 0.0,
176497-
"decode_page": 0.008376,
176498-
"decode_resources": 0.002849,
176497+
"decode_page": 0.007453,
176498+
"decode_resources": 0.002728,
176499176499
"sanitise_contents": 4.7e-05
176500176500
}
176501176501
},
@@ -186267,33 +186267,33 @@
186267186267
"lines": []
186268186268
},
186269186269
"timings": {
186270-
"decode_contents": 0.001524,
186270+
"decode_contents": 0.001489,
186271186271
"decode_dimensions": 0.0,
186272-
"decode_page": 0.004688,
186273-
"decode_resources": 0.001911,
186272+
"decode_page": 0.004278,
186273+
"decode_resources": 0.001836,
186274186274
"sanitise_contents": 2.2e-05
186275186275
}
186276186276
}
186277186277
],
186278186278
"timings": {
186279-
"decode_contents": 0.05547,
186280-
"decode_dimensions": 0.0,
186281-
"decode_document": 0.154582,
186282-
"decode_page": 0.149949,
186283-
"decode_resources": 0.068392,
186284-
"decoding page 0": 0.016121,
186285-
"decoding page 1": 0.025364,
186286-
"decoding page 10": 0.008664,
186287-
"decoding page 11": 0.004831,
186288-
"decoding page 2": 0.011215,
186289-
"decoding page 3": 0.01223,
186290-
"decoding page 4": 0.011415,
186291-
"decoding page 5": 0.016156,
186292-
"decoding page 6": 0.018376,
186293-
"decoding page 7": 0.008278,
186294-
"decoding page 8": 0.008797,
186295-
"decoding page 9": 0.011129,
186296-
"process_document_from_bytesio": 0.000393,
186297-
"sanitise_contents": 0.00043200000000000004
186279+
"decode_contents": 0.055453,
186280+
"decode_dimensions": 4e-06,
186281+
"decode_document": 0.152719,
186282+
"decode_page": 0.14798499999999998,
186283+
"decode_resources": 0.066428,
186284+
"decoding page 0": 0.015425,
186285+
"decoding page 1": 0.025324,
186286+
"decoding page 10": 0.007762,
186287+
"decoding page 11": 0.004436,
186288+
"decoding page 2": 0.011409,
186289+
"decoding page 3": 0.012426,
186290+
"decoding page 4": 0.011271,
186291+
"decoding page 5": 0.016368,
186292+
"decoding page 6": 0.018673,
186293+
"decoding page 7": 0.007881,
186294+
"decoding page 8": 0.008561,
186295+
"decoding page 9": 0.011247,
186296+
"process_document_from_bytesio": 0.000421,
186297+
"sanitise_contents": 0.000414
186298186298
}
186299186299
}

0 commit comments

Comments
 (0)