Skip to content

Commit 22cf280

Browse files
feat: add the export of annotations and ToC (#58)
Signed-off-by: Peter Staar <[email protected]>
1 parent b3a33a2 commit 22cf280

File tree

199 files changed

+4850762
-120856
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

199 files changed

+4850762
-120856
lines changed

app/pybind_parse.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,12 @@ PYBIND11_MODULE(docling_parse, m) {
8585
.def("unload_document", &docling::docling_parser_v2::unload_document)
8686

8787
.def("number_of_pages", &docling::docling_parser_v2::number_of_pages)
88+
89+
.def("get_annotations", &docling::docling_parser_v2::get_annotations,
90+
"Get annotations at the top-level of the document")
91+
92+
.def("get_table_of_contents", &docling::docling_parser_v2::get_table_of_contents,
93+
"Get the table-of-contents (None if not available)")
8894

8995
.def("parse_pdf_from_key",
9096
pybind11::overload_cast<std::string>(&docling::docling_parser_v2::parse_pdf_from_key),

docling_parse/visualize.py

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,24 @@ def visualise_v1(
212212
img.save(oname)
213213

214214

215+
def draw_annotations(draw, annot, H, W):
    """Draw the rectangle of an annotation and, recursively, of its kids.

    Args:
        draw: object exposing ``polygon(points, outline=..., fill=...)``
            (presumably a PIL ``ImageDraw`` instance -- confirm with caller).
        annot: annotation mapping; only the "/Rect" and "/Kids" keys are read.
            "/Rect" is indexed as ``[x0, y0, x1, y1]``.
        H: height used to flip the vertical axis (every y is drawn as
            ``H - y``).
        W: unused; kept so the signature stays compatible with callers.
    """
    if "/Rect" in annot:
        bbox = annot["/Rect"]

        bl = (bbox[0], H - bbox[1])
        br = (bbox[2], H - bbox[1])
        tr = (bbox[2], H - bbox[3])
        tl = (bbox[0], H - bbox[3])

        # Draw the rectangle as a polygon
        draw.polygon([bl, br, tr, tl], outline="white", fill="green")

    if "/Kids" in annot:
        # Recurse into each child. Recursing on `annot` itself (as the
        # previous version did) never terminates and never draws the kids.
        for kid in annot["/Kids"]:
            draw_annotations(draw, kid, H, W)
231+
232+
215233
def visualise_v2(
216234
log_level: str,
217235
pdf_path: str,
@@ -260,6 +278,8 @@ def visualise_v2(
260278

261279
lines = page[_]["lines"]
262280

281+
annots = page["annotations"]
282+
263283
if PIL_INSTALLED:
264284

265285
W = dimension["width"]
@@ -286,24 +306,6 @@ def visualise_v2(
286306
# Draw the rectangle as a polygon
287307
draw.polygon([bl, br, tr, tl], outline="green", fill="yellow")
288308

289-
# Draw each rectangle by connecting its four points
290-
for line in lines:
291-
292-
i = line["i"]
293-
x = line["x"]
294-
y = line["y"]
295-
296-
for l in range(0, len(i), 2):
297-
i0 = i[l + 0]
298-
i1 = i[l + 1]
299-
300-
for k in range(i0, i1 - 1):
301-
draw.line(
302-
(x[k], H - y[k], x[k + 1], H - y[k + 1]),
303-
fill="black",
304-
width=3,
305-
)
306-
307309
# Draw each rectangle by connecting its four points
308310
for row in cells:
309311

@@ -330,6 +332,28 @@ def visualise_v2(
330332
# You can change the outline and fill color
331333
draw.polygon(rect, outline="red", fill="blue")
332334

335+
# Draw widgets
336+
for annot in annots:
337+
draw_annotations(draw, annot, H, W)
338+
339+
# Draw each rectangle by connecting its four points
340+
for line in lines:
341+
342+
i = line["i"]
343+
x = line["x"]
344+
y = line["y"]
345+
346+
for l in range(0, len(i), 2):
347+
i0 = i[l + 0]
348+
i1 = i[l + 1]
349+
350+
for k in range(i0, i1 - 1):
351+
draw.line(
352+
(x[k], H - y[k], x[k + 1], H - y[k + 1]),
353+
fill="black",
354+
width=1,
355+
)
356+
333357
# Show the image
334358
if interactive:
335359
img.show()

src/pybind/docling_parser_v2.h

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ namespace docling
3535

3636
int number_of_pages(std::string key);
3737

38+
nlohmann::json get_annotations(std::string key);
39+
nlohmann::json get_table_of_contents(std::string key);
40+
3841
nlohmann::json parse_pdf_from_key(std::string key);
3942

4043
nlohmann::json parse_pdf_from_key_on_page(std::string key, int page);
@@ -230,10 +233,40 @@ namespace docling
230233

231234
return -1;
232235
}
236+
237+
// Looks up the document registered under `key` in key2doc and forwards to its
// get_annotations(). When the key is unknown, an error is logged and a JSON
// null is returned instead.
nlohmann::json docling_parser_v2::get_annotations(std::string key)
{
  LOG_S(INFO) << __FUNCTION__;

  auto doc_itr = key2doc.find(key);

  // happy path first: the document is loaded
  if(doc_itr!=key2doc.end())
    {
      return (doc_itr->second)->get_annotations();
    }

  LOG_S(ERROR) << "key not found: " << key;
  return nlohmann::json::value_t::null;
}
251+
252+
// Looks up the document registered under `key` in key2doc and forwards to its
// get_table_of_contents(). When the key is unknown, an error is logged and a
// JSON null is returned instead.
nlohmann::json docling_parser_v2::get_table_of_contents(std::string key)
{
  LOG_S(INFO) << __FUNCTION__;

  auto doc_itr = key2doc.find(key);

  // happy path first: the document is loaded
  if(doc_itr!=key2doc.end())
    {
      return (doc_itr->second)->get_table_of_contents();
    }

  LOG_S(ERROR) << "key not found: " << key;
  return nlohmann::json::value_t::null;
}
233266

234267
nlohmann::json docling_parser_v2::parse_pdf_from_key(std::string key)
235268
{
236-
LOG_S(WARNING) << __FUNCTION__;
269+
LOG_S(INFO) << __FUNCTION__;
237270

238271
auto itr = key2doc.find(key);
239272

src/v2/parser.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@ namespace plib
2424
bool initialise(nlohmann::json& data);
2525

2626
private:
27-
28-
2927

3028
void execute_parse();
3129

src/v2/pdf_decoders/document.h

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ namespace pdflib
2121
nlohmann::json get();
2222

2323
// Returns the cached page count. The member is initialised to -1 by the
// constructors and is overwritten with the /Pages -> /Count value once a
// document has been processed.
int get_number_of_pages() { return number_of_pages; }
24+
25+
// Returns a copy of the document-level annotations JSON. The member is JSON
// null until it is assigned from extract_document_annotations_in_json() while
// a document is being processed.
nlohmann::json get_annotations() { return json_annots; }
26+
// Returns a copy of the "table_of_contents" entry of the document-level
// annotations, or JSON null when that entry is not available.
//
// The lookup uses find() instead of operator[]: the non-const operator[]
// would turn a null json_annots into an object and insert the missing key
// (changing what get_annotations() returns afterwards), and it throws when
// json_annots holds a non-object value. find() has neither side effect.
nlohmann::json get_table_of_contents()
{
  auto toc_itr = json_annots.find("table_of_contents");

  if(toc_itr==json_annots.end())
    {
      return nlohmann::json::value_t::null;
    }

  return *toc_itr;
}
2427

2528
bool process_document_from_file(std::string& _filename);
2629
bool process_document_from_bytesio(std::string& _buffer);
@@ -47,6 +50,8 @@ namespace pdflib
4750

4851
int number_of_pages;
4952

53+
//nlohmann::json json_toc; // table-of-contents
54+
nlohmann::json json_annots;
5055
nlohmann::json json_document;
5156
};
5257

@@ -56,15 +61,15 @@ namespace pdflib
5661

5762
timings({}),
5863
qpdf_document(),
59-
60-
//qpdf_root(NULL),
61-
//qpdf_pages(NULL),
6264

6365
// keep compatibility between QPDF v10 and v11
6466
qpdf_root(),
6567
qpdf_pages(),
6668

6769
number_of_pages(-1),
70+
71+
//json_toc(nlohmann::json::value_t::null),
72+
json_annots(nlohmann::json::value_t::null),
6873
json_document(nlohmann::json::value_t::null)
6974
{}
7075

@@ -75,14 +80,14 @@ namespace pdflib
7580
timings(timings_),
7681
qpdf_document(),
7782

78-
//qpdf_root(NULL),
79-
//qpdf_pages(NULL),
80-
8183
// keep compatibility between QPDF v10 and v11
8284
qpdf_root(),
8385
qpdf_pages(),
8486

8587
number_of_pages(-1),
88+
89+
//json_toc(nlohmann::json::value_t::null),
90+
json_annots(nlohmann::json::value_t::null),
8691
json_document(nlohmann::json::value_t::null)
8792
{}
8893

@@ -91,6 +96,11 @@ namespace pdflib
9196

9297
nlohmann::json pdf_decoder<DOCUMENT>::get()
9398
{
99+
{
100+
//json_document["table_of_contents"] = json_toc;
101+
json_document["annotations"] = json_annots;
102+
}
103+
94104
{
95105
nlohmann::json& timings_ = json_document["timings"];
96106

@@ -118,6 +128,9 @@ namespace pdflib
118128
qpdf_root = qpdf_document.getRoot();
119129
qpdf_pages = qpdf_root.getKey("/Pages");
120130

131+
//json_toc = extract_toc_in_json(qpdf_root);
132+
json_annots = extract_document_annotations_in_json(qpdf_document, qpdf_root);
133+
121134
number_of_pages = qpdf_pages.getKey("/Count").getIntValue();
122135
LOG_S(INFO) << "#-pages: " << number_of_pages;
123136

@@ -148,13 +161,17 @@ namespace pdflib
148161
try
149162
{
150163
std::string description = "processing buffer";
151-
qpdf_document.processMemoryFile(description.c_str(), buffer.c_str(), buffer.size());
164+
qpdf_document.processMemoryFile(description.c_str(),
165+
buffer.c_str(), buffer.size());
152166

153167
LOG_S(INFO) << "buffer processed by qpdf!";
154168

155169
qpdf_root = qpdf_document.getRoot();
156170
qpdf_pages = qpdf_root.getKey("/Pages");
157171

172+
//json_toc = extract_toc_in_json(qpdf_root);
173+
json_annots = extract_document_annotations_in_json(qpdf_document, qpdf_root);
174+
158175
number_of_pages = qpdf_pages.getKey("/Count").getIntValue();
159176
LOG_S(INFO) << "#-pages: " << number_of_pages;
160177

@@ -253,7 +270,8 @@ namespace pdflib
253270
timings[__FUNCTION__] = timer.get_time();
254271
}
255272

256-
void pdf_decoder<DOCUMENT>::update_timings(std::map<std::string, double>& timings_, bool set_timer)
273+
void pdf_decoder<DOCUMENT>::update_timings(std::map<std::string, double>& timings_,
274+
bool set_timer)
257275
{
258276
for(auto itr=timings_.begin(); itr!=timings_.end(); itr++)
259277
{

src/v2/pdf_decoders/page.h

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ namespace pdflib
3939
// Contents
4040
void decode_contents();
4141

42+
void decode_annots();
43+
4244
void sanitise_contents();
4345

4446
private:
@@ -49,6 +51,8 @@ namespace pdflib
4951
QPDFObjectHandle qpdf_fonts;
5052
QPDFObjectHandle qpdf_xobjects;
5153

54+
nlohmann::json json_annots;
55+
5256
nlohmann::json json_page;
5357
nlohmann::json json_resources;
5458
nlohmann::json json_grphs;
@@ -85,6 +89,8 @@ namespace pdflib
8589
{
8690
nlohmann::json result;
8791
{
92+
result["annotations"] = json_annots;
93+
8894
nlohmann::json& timings_ = result["timings"];
8995
{
9096
for(auto itr=timings.begin(); itr!=timings.end(); itr++)
@@ -126,6 +132,8 @@ namespace pdflib
126132
utils::timer timer;
127133

128134
json_page = to_json(qpdf_page);
135+
136+
json_annots = extract_annots_in_json(qpdf_page);
129137

130138
try
131139
{
@@ -153,8 +161,10 @@ namespace pdflib
153161

154162
decode_contents();
155163

164+
decode_annots();
165+
156166
sanitise_contents();
157-
167+
158168
timings[__FUNCTION__] = timer.get_time();
159169

160170
return timings;
@@ -264,6 +274,82 @@ namespace pdflib
264274
}
265275
}
266276

277+
timings[__FUNCTION__] = timer.get_time();
278+
}
279+
280+
// Turns widget annotations of the page into page cells.
//
// json_annots is scanned (only when it is an array) and every entry that
//   - has "/Type" equal to the string "/Annot",
//   - has "/Subtype" equal to the string "/Widget",
//   - has a "/Rect" array of exactly four numbers,
//   - has a string "/V" (used as the text of the cell), and
//   - has a "/T" key
// is appended to page_cells as a cell flagged with `widget = true`, whose
// bounding box and rotated box are both taken from "/Rect".
//
// Entries that do not match are skipped. In particular, an entry whose
// fields have an unexpected JSON type is skipped instead of raising a
// nlohmann type error that would abort the decoding.
void pdf_decoder<PAGE>::decode_annots()
{
  LOG_S(INFO) << __FUNCTION__;
  utils::timer timer;

  // true iff `obj[key]` exists, is a string and equals `expected`
  auto has_string = [](const nlohmann::json& obj, const char* key, const char* expected) -> bool
  {
    auto itr = obj.find(key);
    return (itr!=obj.end() and itr->is_string() and itr->get<std::string>()==expected);
  };

  // true iff `rect` is an array of exactly four numbers
  auto is_bbox = [](const nlohmann::json& rect) -> bool
  {
    if((not rect.is_array()) or rect.size()!=4)
      {
        return false;
      }

    for(const auto& coor:rect)
      {
        if(not coor.is_number())
          {
            return false;
          }
      }

    return true;
  };

  if(json_annots.is_array())
    {
      // by const-reference: avoids deep-copying every annotation
      for(const auto& item:json_annots)
        {
          LOG_S(INFO) << "analyzing: " << item.dump(2);

          if(not item.is_object())
            {
              continue;
            }

          auto rect_itr = item.find("/Rect");
          auto text_itr = item.find("/V");

          const bool is_filled_widget =
            has_string(item, "/Type", "/Annot") and
            has_string(item, "/Subtype", "/Widget") and
            rect_itr!=item.end() and is_bbox(*rect_itr) and
            text_itr!=item.end() and text_itr->is_string() and
            item.count("/T")==1;

          if(not is_filled_widget)
            {
              continue;
            }

          std::array<double, 4> bbox = rect_itr->get<std::array<double, 4> >();
          std::string text = text_itr->get<std::string>();

          pdf_resource<PAGE_CELL> cell;
          {
            cell.widget = true;

            // axis-aligned bounding box
            cell.x0 = bbox[0];
            cell.y0 = bbox[1];
            cell.x1 = bbox[2];
            cell.y1 = bbox[3];

            // rotated box: identical to the bounding box, corners listed as
            // (x0,y0) -> (x1,y0) -> (x1,y1) -> (x0,y1)
            cell.r_x0 = bbox[0];
            cell.r_y0 = bbox[1];
            cell.r_x1 = bbox[2];
            cell.r_y1 = bbox[1];
            cell.r_x2 = bbox[2];
            cell.r_y2 = bbox[3];
            cell.r_x3 = bbox[0];
            cell.r_y3 = bbox[3];

            cell.text = text;
            cell.rendering_mode = 0;

            cell.space_width = 0;
            cell.chars = {};
            cell.widths = {};

            // no font information is read for widgets: placeholder values
            cell.enc_name = "Form-font";

            cell.font_enc = "Form-font";
            cell.font_key = "Form-font";

            cell.font_name = "Form-font";
            cell.font_size = 0;

            cell.italic = false;
            cell.bold = false;

            cell.ocr = false;
            cell.confidence = -1.0;

            cell.stack_size = -1;
            cell.block_count = -1;
            cell.instr_count = -1;
          }

          page_cells.push_back(cell);
        }
    }

  timings[__FUNCTION__] = timer.get_time();
}
269355

0 commit comments

Comments
 (0)