Skip to content

Commit 4f3d59b

Browse files
Updated the tests with new ground-truth data to represent PdfShape, PdfWidget, and PdfHyperlink objects
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
1 parent 82cd64d commit 4f3d59b

File tree

184 files changed

+507128
-174683
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

184 files changed

+507128
-174683
lines changed

app/pybind_parse.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
143143
.def_readonly("x1", &pdflib::page_item<pdflib::PAGE_WIDGET>::x1)
144144
.def_readonly("y1", &pdflib::page_item<pdflib::PAGE_WIDGET>::y1)
145145
.def_readonly("text", &pdflib::page_item<pdflib::PAGE_WIDGET>::text)
146+
.def_readonly("description", &pdflib::page_item<pdflib::PAGE_WIDGET>::description)
146147
.def_readonly("field_name", &pdflib::page_item<pdflib::PAGE_WIDGET>::field_name)
147148
.def_readonly("field_type", &pdflib::page_item<pdflib::PAGE_WIDGET>::field_type);
148149

docling_parse/pdf_parser.py

Lines changed: 57 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@
1313
BoundingRectangle,
1414
ColorRGBA,
1515
Coord2D,
16+
PdfHyperlink,
1617
PdfMetaData,
1718
PdfPageBoundaryType,
1819
PdfPageGeometry,
1920
PdfShape,
2021
PdfTableOfContents,
2122
PdfTextCell,
23+
PdfWidget,
2224
SegmentedPdfPage,
2325
TextCell,
2426
TextDirection,
@@ -425,13 +427,16 @@ def _to_page_geometry_from_decoder(self, page_dim) -> PdfPageGeometry:
425427
coord_origin=CoordOrigin.BOTTOMLEFT,
426428
)
427429
art_bbox_obj = BoundingBox(
428-
l=crop_bbox[0], b=crop_bbox[1], r=crop_bbox[2], t=crop_bbox[3]
430+
l=crop_bbox[0], b=crop_bbox[1], r=crop_bbox[2], t=crop_bbox[3],
431+
coord_origin=CoordOrigin.BOTTOMLEFT,
429432
)
430433
media_bbox_obj = BoundingBox(
431-
l=media_bbox[0], b=media_bbox[1], r=media_bbox[2], t=media_bbox[3]
434+
l=media_bbox[0], b=media_bbox[1], r=media_bbox[2], t=media_bbox[3],
435+
coord_origin=CoordOrigin.BOTTOMLEFT,
432436
)
433437
crop_bbox_obj = BoundingBox(
434-
l=crop_bbox[0], b=crop_bbox[1], r=crop_bbox[2], t=crop_bbox[3]
438+
l=crop_bbox[0], b=crop_bbox[1], r=crop_bbox[2], t=crop_bbox[3],
439+
coord_origin=CoordOrigin.BOTTOMLEFT,
435440
)
436441

437442
return PdfPageGeometry(
@@ -531,53 +536,58 @@ def _to_shapes_from_decoder(self, shapes_container) -> List[PdfShape]:
531536

532537
return result
533538

534-
def _to_shapes_from_widgets(self, widgets_container) -> List[PdfShape]:
535-
"""Convert typed PdfWidgets container to list of PdfShape (rectangle per widget)."""
536-
result: List[PdfShape] = []
539+
def _to_widgets_from_decoder(self, widgets_container) -> List[PdfWidget]:
540+
"""Convert typed PdfWidgets container to list of PdfWidget objects."""
541+
result: List[PdfWidget] = []
537542

538543
for ind, widget in enumerate(widgets_container):
539-
points = [
540-
Coord2D(widget.x0, widget.y0),
541-
Coord2D(widget.x1, widget.y0),
542-
Coord2D(widget.x1, widget.y1),
543-
Coord2D(widget.x0, widget.y1),
544-
Coord2D(widget.x0, widget.y0), # close the rectangle
545-
]
546-
pdf_shape = PdfShape(
547-
index=ind,
548-
parent_id=0,
549-
points=points,
550-
has_graphics_state=True,
551-
line_width=1.0,
552-
rgb_stroking=ColorRGBA(r=255, g=165, b=0, a=255), # orange
553-
rgb_filling=ColorRGBA(r=255, g=165, b=0, a=64), # orange, translucent
544+
rect = BoundingRectangle(
545+
r_x0=widget.x0,
546+
r_y0=widget.y0,
547+
r_x1=widget.x1,
548+
r_y1=widget.y0,
549+
r_x2=widget.x1,
550+
r_y2=widget.y1,
551+
r_x3=widget.x0,
552+
r_y3=widget.y1,
553+
)
554+
result.append(
555+
PdfWidget(
556+
index=ind,
557+
rect=rect,
558+
widget_text=widget.text or None,
559+
widget_description=widget.description or None,
560+
widget_field_name=widget.field_name or None,
561+
widget_field_type=widget.field_type or None,
562+
)
554563
)
555-
result.append(pdf_shape)
556564

557565
return result
558566

559-
def _to_shapes_from_hyperlinks(self, hyperlinks_container) -> List[PdfShape]:
560-
"""Convert typed PdfHyperlinks container to list of PdfShape (rectangle per hyperlink)."""
561-
result: List[PdfShape] = []
567+
def _to_hyperlinks_from_decoder(
568+
self, hyperlinks_container
569+
) -> List[PdfHyperlink]:
570+
"""Convert typed PdfHyperlinks container to list of PdfHyperlink objects."""
571+
result: List[PdfHyperlink] = []
562572

563573
for ind, hyperlink in enumerate(hyperlinks_container):
564-
points = [
565-
Coord2D(hyperlink.x0, hyperlink.y0),
566-
Coord2D(hyperlink.x1, hyperlink.y0),
567-
Coord2D(hyperlink.x1, hyperlink.y1),
568-
Coord2D(hyperlink.x0, hyperlink.y1),
569-
Coord2D(hyperlink.x0, hyperlink.y0), # close the rectangle
570-
]
571-
pdf_shape = PdfShape(
572-
index=ind,
573-
parent_id=0,
574-
points=points,
575-
has_graphics_state=True,
576-
line_width=1.0,
577-
rgb_stroking=ColorRGBA(r=0, g=0, b=255, a=255), # blue
578-
rgb_filling=ColorRGBA(r=0, g=0, b=255, a=64), # blue, translucent
574+
rect = BoundingRectangle(
575+
r_x0=hyperlink.x0,
576+
r_y0=hyperlink.y0,
577+
r_x1=hyperlink.x1,
578+
r_y1=hyperlink.y0,
579+
r_x2=hyperlink.x1,
580+
r_y2=hyperlink.y1,
581+
r_x3=hyperlink.x0,
582+
r_y3=hyperlink.y1,
583+
)
584+
result.append(
585+
PdfHyperlink(
586+
index=ind,
587+
rect=rect,
588+
uri=hyperlink.uri or None,
589+
)
579590
)
580-
result.append(pdf_shape)
581591

582592
return result
583593

@@ -660,8 +670,10 @@ def _to_segmented_page_from_decoder(
660670

661671
char_cells = self._to_cells_from_decoder(page_decoder.get_char_cells())
662672
shapes = self._to_shapes_from_decoder(page_decoder.get_page_shapes())
663-
shapes += self._to_shapes_from_widgets(page_decoder.get_page_widgets())
664-
shapes += self._to_shapes_from_hyperlinks(page_decoder.get_page_hyperlinks())
673+
widgets = self._to_widgets_from_decoder(page_decoder.get_page_widgets())
674+
hyperlinks = self._to_hyperlinks_from_decoder(
675+
page_decoder.get_page_hyperlinks()
676+
)
665677
bitmap_resources = self._to_bitmap_resources_from_decoder(
666678
page_decoder.get_page_images()
667679
)
@@ -676,6 +688,8 @@ def _to_segmented_page_from_decoder(
676688
has_chars=len(char_cells) > 0,
677689
bitmap_resources=bitmap_resources,
678690
shapes=shapes,
691+
widgets=widgets,
692+
hyperlinks=hyperlinks,
679693
)
680694

681695
if page_decoder.has_word_cells():

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ dependencies = [
3232
"tabulate>=0.9.0,<1.0.0",
3333
"pillow>=10.0.0,<13.0.0",
3434
"pydantic>=2.0.0",
35-
"docling-core>=2.64.0",
36-
# "docling-core @ git+https://github.com/docling-project/docling-core.git@e4267bfc45fd196e4db4207513440a94a5fd0386",
35+
"docling-core>=2.65.1",
36+
# "docling-core @ git+https://github.com/docling-project/docling-core.git@720a14846ac6fe9f53031305af04d50b6d8479e4",
3737
"pywin32>=305; sys_platform == 'win32'",
3838
]
3939
[project.urls]

src/parse/page_items/page_widget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,15 @@ namespace pdflib
2828

2929
// Widget-specific fields
3030
std::string text;
31+
std::string description;
3132
std::string field_name;
3233
std::string field_type;
3334
};
3435

3536
page_item<PAGE_WIDGET>::page_item():
3637
x0(0), y0(0), x1(0), y1(0),
3738
text(),
39+
description(),
3840
field_name(),
3941
field_type()
4042
{}
@@ -52,6 +54,7 @@ namespace pdflib
5254
result["y1"] = utils::values::round(y1);
5355

5456
result["text"] = text;
57+
result["description"] = description;
5558
result["field_name"] = field_name;
5659
result["field_type"] = field_type;
5760
}

tests/data/groundtruth/broken_media_box_v01.pdf.page_no_1.py.json

Lines changed: 1618 additions & 607 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)