Skip to content

Commit 6494128

Browse files
authored
fix: temporarily disable multicolumn logic (#120)
Temporarily short-circuits the new multi-column ordering logic after seeing examples where it misorders elements that were previously ordered correctly.
1 parent 42139eb commit 6494128

File tree

4 files changed

+29
-72
lines changed

4 files changed

+29
-72
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
## 0.5.1-dev2
1+
## 0.5.1
22

33
* Add annotation for pages
44
* Store page numbers when processing PDFs
55
* Hotfix to handle inference of blank pages using ONNX detectron2
6+
* Revert ordering change to investigate examples of misordering
67

78
## 0.5.0
89

test_unstructured_inference/inference/test_layout.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,3 +506,25 @@ def test_annotate():
506506
assert ((annotated_array[:, :, 0] == 1).mean()) > 0.992
507507
assert ((annotated_array[:, :, 1] == 1).mean()) > 0.992
508508
assert ((annotated_array[:, :, 2] == 1).mean()) > 0.992
509+
510+
511+
@pytest.fixture
512+
def ordering_layout():
513+
elements = [
514+
layout.LayoutElement(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text="0"),
515+
layout.LayoutElement(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text="1"),
516+
layout.LayoutElement(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text="2"),
517+
layout.LayoutElement(x1=444.5, y1=942.3, x2=1261.1, y2=1584.1, text="3"),
518+
layout.LayoutElement(x1=444.8, y1=1609.4, x2=1257.2, y2=1665.2, text="4"),
519+
layout.LayoutElement(x1=414.0, y1=1718.8, x2=635.0, y2=1755.2, text="5"),
520+
layout.LayoutElement(x1=372.6, y1=1786.9, x2=1333.6, y2=1848.7, text="6"),
521+
]
522+
return elements
523+
524+
525+
def test_layout_order(ordering_layout):
526+
with patch.object(layout, "get_model", lambda: lambda x: ordering_layout):
527+
doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf")
528+
page = doc.pages[0]
529+
for n, element in enumerate(page.elements):
530+
assert element.text == str(n)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.1-dev2" # pragma: no cover
1+
__version__ = "0.5.1" # pragma: no cover
Lines changed: 4 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,8 @@
1-
from typing import List, Union
1+
from typing import List
22

33
from unstructured_inference.inference.elements import TextRegion
44

55

6-
class Column:
7-
"""Class to capture a column of text in the layout. Will update the midpoint of the
8-
column as layout elements are added to help with new element comparisons."""
9-
10-
def __init__(self, layout_elements: List[TextRegion] = []):
11-
self.layout_elements = layout_elements
12-
13-
num_elements = len(layout_elements)
14-
if num_elements > 0:
15-
self.x_midpoint = sum([el.x_midpoint for el in layout_elements]) / num_elements
16-
else:
17-
self.x_midpoint = 0
18-
19-
def add_element(self, layout_element: TextRegion):
20-
"""Adds an element to the column and updates the midpoint."""
21-
self.layout_elements.append(layout_element)
22-
num_elements = len(self.layout_elements)
23-
self.x_midpoint = sum([el.x_midpoint for el in self.layout_elements]) / num_elements
24-
25-
266
def order_layout(
277
layout: List[TextRegion],
288
column_tol_factor: float = 0.2,
@@ -47,53 +27,7 @@ def order_layout(
4727
if len(layout) == 0:
4828
return []
4929

50-
width = calculate_width(layout)
51-
column_tolerance = column_tol_factor * width
52-
full_page_min_width = full_page_threshold_factor * width
53-
5430
layout.sort(key=lambda element: element.y1)
55-
56-
sorted_layout = []
57-
columns: List[Column] = []
58-
for layout_element in layout:
59-
if layout_element.width > full_page_min_width:
60-
sorted_layout.extend(sorted_layout_from_columns(columns))
61-
columns = []
62-
sorted_layout.append(layout_element)
63-
64-
else:
65-
added_to_column = False
66-
for column in columns:
67-
difference = abs(layout_element.x_midpoint - column.x_midpoint)
68-
if difference < column_tolerance:
69-
column.add_element(layout_element)
70-
added_to_column = True
71-
break
72-
73-
if not added_to_column:
74-
columns.append(Column(layout_elements=[layout_element]))
75-
76-
sorted_layout.extend(sorted_layout_from_columns(columns))
77-
return sorted_layout
78-
79-
80-
def sorted_layout_from_columns(columns: List[Column]) -> List[TextRegion]:
81-
"""Creates a sorted list of elements from a list of columns. Columns will be sorted
82-
left to right and elements within columns are sorted top to bottom."""
83-
sorted_layout = []
84-
if len(columns) > 0:
85-
columns.sort(key=lambda column: column.x_midpoint)
86-
for column in columns:
87-
column.layout_elements.sort(key=lambda element: element.y1)
88-
for layout_element in column.layout_elements:
89-
sorted_layout.append(layout_element)
90-
return sorted_layout
91-
92-
93-
def calculate_width(layout) -> Union[float, int]:
94-
"""Calculates total width of the elements in the layout. Used for computing the full
95-
page threshold and column tolerance."""
96-
min_x1 = min([element.x1 for element in layout])
97-
max_x2 = max([element.x2 for element in layout])
98-
99-
return max_x2 - min_x1
31+
# NOTE(alan): Temporarily revert to original logic pending fixing the new logic
32+
# See code prior to this commit for new logic.
33+
return layout

0 commit comments

Comments
 (0)