Skip to content

Commit 688789e

Browse files
cau-gitPeterStaar-IBMceberamvagenas
authored
feat: (experimental) introduce new document format (#21)
* Draft new docling document format, pydantic model and tests Signed-off-by: Christoph Auer <[email protected]> * Fix tests to have unique document_hashes per test Signed-off-by: Christoph Auer <[email protected]> * Manual update from main Signed-off-by: Christoph Auer <[email protected]> * Move new-format to experimental path Signed-off-by: Christoph Auer <[email protected]> * Updates for document construction API and format Signed-off-by: Christoph Auer <[email protected]> * Add comments Signed-off-by: Christoph Auer <[email protected]> * Add BaseTableData and table cell typing Signed-off-by: Christoph Auer <[email protected]> * Tree element iterator, several API fixes Signed-off-by: Christoph Auer <[email protected]> * Turn captions into list field Signed-off-by: Christoph Auer <[email protected]> * Add export methods to DoclingDocument and types Signed-off-by: Christoph Auer <[email protected]> * Change DoclingDocument.iterate_elements and add print tree function Signed-off-by: Christoph Auer <[email protected]> * Introduce label enum types, apply everywhere Signed-off-by: Christoph Auer <[email protected]> * Introduce provenance info, use enum labels Signed-off-by: Christoph Auer <[email protected]> * Update formatting Signed-off-by: Christoph Auer <[email protected]> * Docstrings and linter fixes Signed-off-by: Christoph Auer <[email protected]> * Lockfile rollback, since updating breaks tests Signed-off-by: Christoph Auer <[email protected]> * Cleanup Signed-off-by: Christoph Auer <[email protected]> * Several improvements and cleanup Signed-off-by: Christoph Auer <[email protected]> * Format fixes Signed-off-by: Christoph Auer <[email protected]> * Big redesign for usage of hashes, several other fixes Signed-off-by: Christoph Auer <[email protected]> * Fix flake8 config Signed-off-by: Christoph Auer <[email protected]> * Remove hash, renamings Signed-off-by: Christoph Auer <[email protected]> * updating the tests Signed-off-by: Peter Staar <[email 
protected]> * added some unit tests for DocItem Signed-off-by: Peter Staar <[email protected]> * Add tree validation API and test case Signed-off-by: Christoph Auer <[email protected]> * Add extra=Forbid to NodeItem Signed-off-by: Christoph Auer <[email protected]> * feat: set DoclingDocument version as SemanticVersion with default Signed-off-by: Cesar Berrospi Ramis <[email protected]> * Cleanup Signed-off-by: Christoph Auer <[email protected]> * Simpler literal enforcement Signed-off-by: Christoph Auer <[email protected]> * Fix static document version Signed-off-by: Christoph Auer <[email protected]> * Rollback changes to allow for semver<3 Signed-off-by: Christoph Auer <[email protected]> * build: rollback changes to include python-semantic-release as dev dependency Signed-off-by: Cesar Berrospi Ramis <[email protected]> * feat: set version field as string with pattern and check compatibility Signed-off-by: Cesar Berrospi Ramis <[email protected]> * add JSON Pointer validation to refs, fix test data Signed-off-by: Panos Vagenas <[email protected]> --------- Signed-off-by: Christoph Auer <[email protected]> Signed-off-by: Peter Staar <[email protected]> Signed-off-by: Cesar Berrospi Ramis <[email protected]> Signed-off-by: Panos Vagenas <[email protected]> Co-authored-by: Peter Staar <[email protected]> Co-authored-by: Cesar Berrospi Ramis <[email protected]> Co-authored-by: Panos Vagenas <[email protected]>
1 parent b76780c commit 688789e

File tree

17 files changed

+15508
-11
lines changed

17 files changed

+15508
-11
lines changed

.flake8

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
[flake8]
2+
per-file-ignores = __init__.py:F401
23
max-line-length = 88
34
exclude = test/*
45
max-complexity = 18
56
docstring-convention = google
67
ignore = W503,E203
7-
classmethod-decorators = classmethod,validator
8+
classmethod-decorators = classmethod,validator

docling_core/types/doc/base.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,6 @@ def export_to_document_tokens(
440440
):
441441
"""Export text element to document tokens format."""
442442
body = f"<{self.obj_type}>"
443-
# body = f"<{self.name}>"
444443

445444
assert DocumentToken.is_known_token(
446445
body
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#
2+
# Copyright IBM Corp. 2024 - 2024
3+
# SPDX-License-Identifier: MIT
4+
#
5+
6+
"""Package for models defined by the Document type."""
7+
8+
from .base import BoundingBox, CoordOrigin, Size
9+
from .document import (
10+
BasePictureData,
11+
BaseTableData,
12+
DescriptionItem,
13+
DocItem,
14+
DoclingDocument,
15+
DocumentOrigin,
16+
FloatingItem,
17+
GroupItem,
18+
ImageRef,
19+
KeyValueItem,
20+
NodeItem,
21+
PageItem,
22+
PictureItem,
23+
ProvenanceItem,
24+
RefItem,
25+
SectionHeaderItem,
26+
TableCell,
27+
TableItem,
28+
TextItem,
29+
)
30+
from .labels import DocItemLabel, GroupLabel, TableCellLabel
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
"""Models for the base data types."""
2+
3+
import copy
4+
from enum import Enum
5+
from typing import Tuple
6+
7+
from pydantic import BaseModel
8+
9+
10+
class CoordOrigin(str, Enum):
    """Reference corner from which vertical coordinates are measured.

    The enum mixes in ``str``, so every member compares equal to the plain
    string of the same name.
    """

    # Origin in the top-left corner.
    TOPLEFT = "TOPLEFT"
    # Origin in the bottom-left corner.
    BOTTOMLEFT = "BOTTOMLEFT"
15+
16+
17+
class Size(BaseModel):
    """Two-dimensional extent expressed as a width and a height.

    Both fields default to ``0.0``.
    """

    width: float = 0.0
    height: float = 0.0

    def as_tuple(self):
        """Return the extent as a ``(width, height)`` pair."""
        dims = self.width, self.height
        return dims
26+
27+
28+
class BoundingBox(BaseModel):
    """Axis-aligned rectangle described by its four edges.

    ``l`` and ``r`` are the horizontal edges; ``t`` and ``b`` are the vertical
    edges, interpreted relative to ``coord_origin``.  Boxes built through
    :meth:`from_tuple` satisfy ``l <= r`` and additionally ``t <= b`` for
    ``TOPLEFT`` or ``b <= t`` for ``BOTTOMLEFT``.  Direct construction does
    not enforce that ordering.
    """

    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self):
        """Horizontal extent, ``r - l``."""
        return self.r - self.l

    @property
    def height(self):
        """Vertical extent, ``abs(t - b)`` (independent of the origin)."""
        return abs(self.t - self.b)

    def scaled(self, scale: float) -> "BoundingBox":
        """Return a copy with all four edges multiplied by ``scale``.

        :param scale: factor applied to ``l``, ``r``, ``t`` and ``b``.
        :returns: a new box; ``self`` is left untouched.
        """
        out_bbox = copy.deepcopy(self)
        out_bbox.l *= scale
        out_bbox.r *= scale
        out_bbox.t *= scale
        out_bbox.b *= scale

        return out_bbox

    def normalized(self, page_size: Size) -> "BoundingBox":
        """Return a copy with edges divided by the page dimensions.

        ``l`` and ``r`` are divided by ``page_size.width``; ``t`` and ``b`` by
        ``page_size.height``.

        :param page_size: dimensions used as divisors.
        :returns: a new box; ``self`` is left untouched.
        :raises ZeroDivisionError: if a page dimension is zero (note that a
            default-constructed ``Size`` is ``0.0`` x ``0.0``).
        """
        out_bbox = copy.deepcopy(self)
        out_bbox.l /= page_size.width
        out_bbox.r /= page_size.width
        out_bbox.t /= page_size.height
        out_bbox.b /= page_size.height

        return out_bbox

    def as_tuple(self):
        """Return the edges as a 4-tuple whose order depends on the origin.

        ``(l, t, r, b)`` for ``TOPLEFT`` and ``(l, b, r, t)`` for
        ``BOTTOMLEFT`` -- the same layouts :meth:`from_tuple` accepts.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.l, self.b, self.r, self.t)

    @classmethod
    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
        """Build a box from the first four values of ``coord``.

        The values are read as ``(l, t, r, b)`` for ``TOPLEFT`` and as
        ``(l, b, r, t)`` for ``BOTTOMLEFT``.  Swapped edges are put back in
        order so that ``l <= r`` and the top/bottom ordering matches the
        origin.

        :param coord: sequence holding at least four coordinates.
        :param origin: coordinate origin the values are expressed in.
        :returns: the normalised box, or ``None`` if ``origin`` matches
            neither known member.
        :raises IndexError: if ``coord`` has fewer than four entries.
        """
        if origin == CoordOrigin.TOPLEFT:
            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            # y grows downwards: the top edge has the smaller value.
            if b < t:
                b, t = t, b

            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
        elif origin == CoordOrigin.BOTTOMLEFT:
            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            # y grows upwards: the top edge has the larger value.
            if b > t:
                b, t = t, b

            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)

    def area(self) -> float:
        """Return the area of the box.

        The vertical extent is ``b - t`` for ``TOPLEFT`` and ``t - b`` for
        ``BOTTOMLEFT``, so a well-ordered box yields a non-negative area in
        either coordinate system.
        """
        area = (self.r - self.l) * (self.b - self.t)
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            # With a bottom-left origin b <= t, so (b - t) carries the wrong
            # sign; flip it instead of reporting a negative area.
            area = -area
        return area

    def intersection_area_with(self, other: "BoundingBox") -> float:
        """Return the area shared by this box and ``other``.

        The vertical overlap is evaluated according to ``self.coord_origin``.
        No coordinate conversion is performed (no page height is available
        here), so the result is only meaningful when both boxes use the same
        origin.

        :param other: box to intersect with.
        :returns: overlap area, or ``0.0`` when the boxes do not overlap.
        """
        # Horizontal overlap is independent of the origin.
        left = max(self.l, other.l)
        right = min(self.r, other.r)
        width = right - left

        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            # y grows upwards: the shared region spans from the higher of the
            # two bottoms up to the lower of the two tops.
            top = min(self.t, other.t)
            bottom = max(self.b, other.b)
            height = top - bottom
        else:
            # y grows downwards: top edges carry the smaller values.
            top = max(self.t, other.t)
            bottom = min(self.b, other.b)
            height = bottom - top

        # If the bounding boxes do not overlap, width or height will be negative
        if width <= 0 or height <= 0:
            return 0.0

        return width * height

    def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
        """Express the box relative to a bottom-left origin.

        :param page_height: height used to flip the vertical axis.
        :returns: ``self`` when already ``BOTTOMLEFT``; otherwise a new box
            with ``t`` and ``b`` replaced by ``page_height - t`` and
            ``page_height - b``.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )

    def to_top_left_origin(self, page_height: float) -> "BoundingBox":
        """Express the box relative to a top-left origin.

        :param page_height: height used to flip the vertical axis.
        :returns: ``self`` when already ``TOPLEFT``; otherwise a new box
            with ``t`` and ``b`` replaced by ``page_height - t`` and
            ``page_height - b``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            # Flipping the axis keeps each edge's role: the old top (larger y)
            # becomes the new top (smaller y), and likewise for the bottom.
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.TOPLEFT,
            )

0 commit comments

Comments
 (0)