Skip to content

Commit 24ebd0f

Browse files
authored
chore: Move coordinate details from Element model to a metadata model (#827)
1 parent 6ec177e commit 24ebd0f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+511
-243
lines changed

CHANGELOG.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
1-
## 0.7.13-dev0
1+
## 0.8.0-dev0
22

33
### Enhancements
44

55
### Features
66

77
### Fixes
88

9+
### BREAKING CHANGES
10+
11+
* Information about an element's location is no longer returned as top-level attributes of an element. Instead, it is returned in the `coordinates` attribute of the element's metadata.
12+
913
## 0.7.12
1014

1115
### Enhancements
@@ -28,7 +32,7 @@
2832

2933
* More deterministic element ordering when using `hi_res` PDF parsing strategy (from unstructured-inference bump to 0.5.4)
3034
* Make large model available (from unstructured-inference bump to 0.5.3)
31-
* Combine inferred elements with extracted elements (from unstructured-inference bump to 0.5.2)
35+
* Combine inferred elements with extracted elements (from unstructured-inference bump to 0.5.2)
3236
* `partition_email` and `partition_msg` will now process attachments if `process_attachments=True`
3337
and an attachment partitioning function is passed through with `attachment_partitioner=partition`.
3438

docs/source/bricks.rst

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1531,7 +1531,6 @@ The output will look like:
15311531
{
15321532
"data": {
15331533
"element_id": "ad270eefd1cc68d15f4d3e51666d4dc8",
1534-
"coordinates": None,
15351534
"text": "A Wonderful Story About A Fox",
15361535
"type": "Title",
15371536
},
@@ -1540,7 +1539,6 @@ The output will look like:
15401539
{
15411540
"data": {
15421541
"element_id": "8275769fdd1804f9f2b55ad3c9b0ef1b",
1543-
"coordinates": None,
15441542
"text": "A fox ran into the chicken coop and the chickens flew off!",
15451543
"type": "NarrativeText",
15461544
},

docs/source/getting_started.rst

Lines changed: 1 addition & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -110,43 +110,6 @@ The following code shows how you can limit your output to only narrative text wi
110110
print("\n")
111111
112112
113-
####################
114-
Element coordinates
115-
####################
116-
117-
Some document types support location data for the elements, usually in the form of bounding boxes.
118-
The ``coordinates`` property of an ``Element`` stores the coordinates of the corners of the
119-
bounding box starting from the top left corner and proceeding counter-clockwise. If the
120-
coordinates are not available, the ``coordinates`` property is ``None``.
121-
122-
The coordinates have an associated coordinate system. A typical example of a coordinate system is
123-
``PixelSpace``, which is used for representing the coordinates of images. The coordinates
124-
represent pixels, the origin is in the top left and the ``y`` coordinate increases in the
125-
downward direction. Information about the coordinate system is found in the
126-
``Element.coordinate_system`` property, including the coordinate system name, a description, the
127-
layout width, and the layout height.
128-
129-
The coordinates of an element can be changed to a new coordinate system by using the
130-
``Element.convert_coordinates_to_new_system`` method. If the ``in_place`` flag is ``True``, the
131-
coordinate system and coordinates of the element are updated in place and the new coordinates are
132-
returned. If the ``in_place`` flag is ``False``, only the altered coordinates are returned.
133-
134-
.. code:: python
135-
136-
from unstructured.documents.elements import Element
137-
from unstructured.documents.coordinates import PixelSpace, RelativeCoordinateSystem
138-
139-
coordinates = ((10, 10), (10, 100), (200, 100), (200, 10))
140-
coordinate_system = PixelSpace(width=850, height=1100)
141-
element = Element(coordinates=coordinates, coordinate_system=coordinate_system)
142-
print(element.coordinate_system)
143-
print(element.coordinate)
144-
element.convert_coordinates_to_new_system(RelativeCoordinateSystem(), in_place=True)
145-
# Should now be in terms of new coordinate system
146-
print(element.coordinate_system)
147-
print(element.coordinate)
148-
149-
150113
####################
151114
Serializing Elements
152115
####################
@@ -173,7 +136,7 @@ serializing and deserializing an ``Element`` list.
173136
174137
elements_to_json(elements, filename=filename)
175138
new_elements = elements_from_json(filename=filename)
176-
139+
177140
# alternatively, one can also serialize/deserialize to/from a string with:
178141
serialized_elements_json = elements_to_json(elements)
179142
new_elements = elements_from_json(text=serialized_elements_json)

docs/source/metadata.rst

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,48 @@ the source file:
1818
* ``page_number``
1919

2020

21+
####################
22+
Element coordinates
23+
####################
24+
25+
Some document types support location data for the elements, usually in the form of bounding boxes.
26+
If it exists, an element's location data is available with ``element.metadata.coordinates``.
27+
28+
The ``coordinates`` property of an ``ElementMetadata`` stores:
29+
* points: These specify the corners of the bounding box starting from the top left corner and
30+
proceeding counter-clockwise. The points represent pixels, the origin is in the top left and
31+
the ``y`` coordinate increases in the downward direction.
32+
* system: The points have an associated coordinate system. A typical example of a coordinate system is
33+
``PixelSpace``, which is used for representing the coordinates of images. The coordinate system has a
34+
name, orientation, layout width, and layout height.
35+
36+
Information about the element’s coordinates (including the coordinate system name, coordinate points,
37+
the layout width, and the layout height) can be accessed with ``element.to_dict()["metadata"]["coordinates"]``.
38+
39+
The coordinates of an element can be changed to a new coordinate system by using the
40+
``Element.convert_coordinates_to_new_system`` method. If the ``in_place`` flag is ``True``, the
41+
coordinate system and points of the element are updated in place and the new coordinates are
42+
returned. If the ``in_place`` flag is ``False``, only the altered coordinates are returned.
43+
44+
.. code:: python
45+
46+
from unstructured.documents.elements import Element
47+
from unstructured.documents.coordinates import PixelSpace, RelativeCoordinateSystem
48+
49+
coordinates = ((10, 10), (10, 100), (200, 100), (200, 10))
50+
coordinate_system = PixelSpace(width=850, height=1100)
51+
element = Element(coordinates=coordinates, coordinate_system=coordinate_system)
52+
print(element.metadata.coordinates.to_dict())
53+
print(element.metadata.coordinates.system.orientation)
54+
print(element.metadata.coordinates.system.width)
55+
print(element.metadata.coordinates.system.height)
56+
element.convert_coordinates_to_new_system(RelativeCoordinateSystem(), in_place=True)
57+
# Should now be in terms of new coordinate system
58+
print(element.metadata.coordinates.to_dict())
59+
print(element.metadata.coordinates.system.orientation)
60+
print(element.metadata.coordinates.system.width)
61+
print(element.metadata.coordinates.system.height)
62+
2163
Email
2264
-----
2365

test_unstructured/documents/test_elements.py

Lines changed: 64 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
Orientation,
1010
RelativeCoordinateSystem,
1111
)
12-
from unstructured.documents.elements import Element, NoID, Text
12+
from unstructured.documents.elements import CoordinatesMetadata, Element, NoID, Text
1313

1414

1515
def test_text_id():
@@ -89,34 +89,80 @@ def test_convert_coordinates_to_new_system(
8989
for new_coord, expected_coord in zip(new_coords, expected_coords):
9090
new_coord == pytest.approx(expected_coord)
9191
element.convert_coordinates_to_new_system(coord2, in_place=True)
92-
for new_coord, expected_coord in zip(element.coordinates, expected_coords):
92+
for new_coord, expected_coord in zip(element.metadata.coordinates.points, expected_coords):
9393
assert new_coord == pytest.approx(expected_coord)
94-
assert element._coordinate_system == coord2
94+
assert element.metadata.coordinates.system == coord2
9595

9696

97-
@pytest.mark.parametrize(
98-
("coordinates", "coordinate_system"),
99-
[
100-
(None, None),
101-
(((1, 2), (1, 4), (3, 4), (3, 2)), None),
102-
(None, RelativeCoordinateSystem()),
103-
],
104-
)
105-
def test_convert_coordinate_to_new_system_none(coordinates, coordinate_system):
106-
element = Element(coordinates=coordinates, coordinate_system=coordinate_system)
97+
def test_convert_coordinate_to_new_system_none():
98+
element = Element(coordinates=None, coordinate_system=None)
10799
coord = CoordinateSystem(100, 200)
108100
coord.orientation = Orientation.SCREEN
109101
assert element.convert_coordinates_to_new_system(coord) is None
110102

111103

112-
def test_coordinate_system():
104+
def test_element_constructor_coordinates_all_present():
113105
coordinates = ((1, 2), (1, 4), (3, 4), (3, 2))
114106
coordinate_system = RelativeCoordinateSystem()
115107
element = Element(coordinates=coordinates, coordinate_system=coordinate_system)
108+
expected_coordinates_metadata = CoordinatesMetadata(
109+
points=coordinates,
110+
system=coordinate_system,
111+
)
112+
assert element.metadata.coordinates == expected_coordinates_metadata
113+
114+
115+
def test_element_constructor_coordinates_points_absent():
116+
with pytest.raises(ValueError) as exc_info:
117+
Element(coordinate_system=RelativeCoordinateSystem())
118+
assert (
119+
str(exc_info.value)
120+
== "Coordinates points should not exist without coordinates system and vice versa."
121+
)
122+
123+
124+
def test_element_constructor_coordinates_system_absent():
125+
with pytest.raises(ValueError) as exc_info:
126+
Element(coordinates=((1, 2), (1, 4), (3, 4), (3, 2)))
127+
assert (
128+
str(exc_info.value)
129+
== "Coordinates points should not exist without coordinates system and vice versa."
130+
)
131+
132+
133+
def test_coordinate_metadata_serdes():
134+
coordinates = ((1, 2), (1, 4), (3, 4), (3, 2))
135+
coordinate_system = RelativeCoordinateSystem()
136+
coordinates_metadata = CoordinatesMetadata(points=coordinates, system=coordinate_system)
116137
expected_schema = {
117-
"name": "RelativeCoordinateSystem",
118-
"description": RelativeCoordinateSystem.__doc__,
119-
"layout_width": 1,
120138
"layout_height": 1,
139+
"layout_width": 1,
140+
"points": ((1, 2), (1, 4), (3, 4), (3, 2)),
141+
"system": "RelativeCoordinateSystem",
142+
}
143+
coordinates_metadata_dict = coordinates_metadata.to_dict()
144+
assert coordinates_metadata_dict == expected_schema
145+
assert CoordinatesMetadata.from_dict(coordinates_metadata_dict) == coordinates_metadata
146+
147+
148+
def test_element_to_dict():
149+
coordinates = ((1, 2), (1, 4), (3, 4), (3, 2))
150+
coordinate_system = RelativeCoordinateSystem()
151+
element = Element(
152+
element_id="awt32t1",
153+
coordinates=coordinates,
154+
coordinate_system=coordinate_system,
155+
)
156+
expected = {
157+
"metadata": {
158+
"coordinates": {
159+
"layout_height": 1,
160+
"layout_width": 1,
161+
"points": ((1, 2), (1, 4), (3, 4), (3, 2)),
162+
"system": "RelativeCoordinateSystem",
163+
},
164+
},
165+
"type": None,
166+
"element_id": "awt32t1",
121167
}
122-
assert element.coordinate_system == expected_schema
168+
assert element.to_dict() == expected

test_unstructured/partition/test_auto.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -214,11 +214,6 @@ def test_auto_partition_json_from_filename():
214214
json_data = json.load(json_f)
215215
json_elems = json.loads(elements_to_json(partition(filename=filename, strategy="hi_res")))
216216
for elem in json_elems:
217-
# coordinates are always in the element data structures, even if None
218-
elem.pop("coordinates")
219-
elem.pop("coordinate_system")
220-
elem.pop("layout_width")
221-
elem.pop("layout_height")
222217
elem.pop("metadata")
223218
for elem in json_data:
224219
elem.pop("metadata")

0 commit comments

Comments
 (0)