Skip to content

Commit 56ce657

Browse files
authored
fix: correct column ordering for multi-column documents (#112)
* first pass on ordering algorithm * doc strings and type hints * added test for multicolumn layouts * version and changelog * handling for empty layout * update version
1 parent c295a50 commit 56ce657

File tree

7 files changed

+128
-4
lines changed

7 files changed

+128
-4
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
* Preserve image format in PIL.Image.Image when loading
44
* Added ONNX version of Detectron2 and make default model
55
* Remove API code, we don't serve this as a standalone API any more
6+
* Update ordering logic to account for multicolumn documents.
67

78
## 0.4.4
89

sample-docs/design-thinking.pdf

6.22 MB
Binary file not shown.

test_unstructured_inference/inference/test_layout.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,3 +458,18 @@ def test_load_pdf_image_placement():
458458
# Image is in top half of the page, so that should be reflected in the pixel coordinates
459459
assert image_region.y1 < images[5].height / 2
460460
assert image_region.y2 < images[5].height / 2
461+
462+
463+
def test_load_pdf_with_multicolumn_layout_and_ocr(filename="sample-docs/design-thinking.pdf"):
    """Checks that elements on the first page of a multi-column PDF come back in the
    expected order when the file is processed with ``model_name=None``."""
    doc = layout.process_file_with_model(filename=filename, model_name=None)
    # Snippets are listed in the order their elements are expected to appear.
    test_snippets = ["Key to design thinking", "Design thinking also", "But in recent years"]

    # Collect, in document order, every element that starts with one of the snippets.
    test_elements = [
        element
        for element in doc.pages[0].elements
        if element.text.startswith(tuple(test_snippets))
    ]

    # Without this check the ordering loop below would pass vacuously whenever some
    # (or all) of the snippets are missing from the extracted elements.
    assert len(test_elements) == len(test_snippets)
    for element, snippet in zip(test_elements, test_snippets):
        assert element.text.startswith(snippet)

test_unstructured_inference/models/test_tables.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ def test_table_prediction(model_path, sample_table_transcript, platform_type):
415415
table_model.initialize(model=model_path)
416416
img = Image.open("./sample-docs/example_table.jpg").convert("RGB")
417417
prediction = table_model.predict(img)
418-
assert prediction == sample_table_transcript
418+
assert prediction.strip() == sample_table_transcript.strip()
419419

420420

421421
def test_intersect():

unstructured_inference/inference/elements.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,16 @@ def height(self) -> Union[int, float]:
6060
"""Height of rectangle"""
6161
return self.y2 - self.y1
6262

63+
@property
def x_midpoint(self) -> Union[int, float]:
    """Horizontal center of the rectangle, i.e. the average of x1 and x2."""
    right, left = self.x2, self.x1
    return (right + left) / 2
67+
68+
@property
def y_midpoint(self) -> Union[int, float]:
    """Vertical center of the rectangle, i.e. the average of y1 and y2."""
    bottom, top = self.y2, self.y1
    return (bottom + top) / 2
72+
6373
def is_disjoint(self, other: Rectangle) -> bool:
6474
"""Checks whether this rectangle is disjoint from another rectangle."""
6575
return not self.intersects(other)

unstructured_inference/inference/layout.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
ImageTextRegion,
1515
)
1616
from unstructured_inference.inference.layoutelement import LayoutElement
17+
from unstructured_inference.inference.ordering import order_layout
1718
from unstructured_inference.logger import logger
1819
from unstructured_inference.models.base import get_model
1920
from unstructured_inference.models.unstructuredmodel import UnstructuredModel
@@ -170,9 +171,7 @@ def get_elements_with_model(self, inplace=True) -> Optional[List[LayoutElement]]
170171
def get_elements_from_layout(self, layout: List[TextRegion]) -> List[LayoutElement]:
171172
"""Uses the given Layout to separate the page text into elements, either extracting the
172173
text from the discovered layout blocks or from the image using OCR."""
173-
# NOTE(robinson) - This orders the page from top to bottom. We'll need more
174-
# sophisticated ordering logic for more complicated layouts.
175-
layout.sort(key=lambda element: element.y1)
174+
layout = order_layout(layout)
176175
elements = [
177176
get_element_from_block(
178177
block=e,
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
from typing import List, Union
2+
3+
from unstructured_inference.inference.elements import TextRegion
4+
5+
6+
class Column:
    """Class to capture a column of text in the layout. Will update the midpoint of the
    column as layout elements are added to help with new element comparisons.

    Attributes
    ----------
    layout_elements
        the elements currently assigned to the column.
    x_midpoint
        the mean of the elements' horizontal midpoints, or 0 for an empty column.
    """

    def __init__(self, layout_elements: Union[List[TextRegion], None] = None):
        # A mutable default ([]) would be shared by every Column created without
        # arguments, and add_element would then leak elements between columns.
        # Copying also keeps add_element from mutating the caller's list.
        self.layout_elements = [] if layout_elements is None else list(layout_elements)
        self.x_midpoint = self._mean_x_midpoint()

    def _mean_x_midpoint(self) -> Union[int, float]:
        """Returns the mean horizontal midpoint of the elements, or 0 if there are none."""
        num_elements = len(self.layout_elements)
        if num_elements == 0:
            return 0
        return sum(el.x_midpoint for el in self.layout_elements) / num_elements

    def add_element(self, layout_element: TextRegion):
        """Adds an element to the column and updates the midpoint."""
        self.layout_elements.append(layout_element)
        self.x_midpoint = self._mean_x_midpoint()
24+
25+
26+
def order_layout(
    layout: List[TextRegion],
    column_tol_factor: float = 0.2,
    full_page_threshold_factor: float = 0.9,
) -> List[TextRegion]:
    """Orders the layout elements detected on a page. For groups of elements that are not
    the width of the page, the algorithm attempts to group elements into columns based on
    the coordinates of the bounding box. Columns are ordered left to right, and elements
    within columns are ordered top to bottom.

    The page width is taken to be the horizontal extent of the layout itself, as computed
    by calculate_width. The input list is not modified.

    Parameters
    ----------
    layout
        the layout elements to order.
    column_tol_factor
        multiplied by the page width to find the tolerance for considering two elements as
        part of the same column.
    full_page_threshold_factor
        multiplied by the page width to find the width an element needs to exceed
        for it to be considered a full page width element.

    Returns
    -------
    A new list containing the same elements in the computed order, or an empty list
    if the layout is empty.
    """
    if not layout:
        return []

    width = calculate_width(layout)
    column_tolerance = column_tol_factor * width
    full_page_min_width = full_page_threshold_factor * width

    sorted_layout: List[TextRegion] = []
    columns: List[Column] = []
    # sorted() instead of layout.sort(): walk the elements top to bottom without
    # reordering the caller's list as a side effect.
    for layout_element in sorted(layout, key=lambda element: element.y1):
        if layout_element.width > full_page_min_width:
            # A full-width element ends the current run of columns: flush the columns
            # collected so far, then emit the full-width element after them.
            sorted_layout.extend(sorted_layout_from_columns(columns))
            columns = []
            sorted_layout.append(layout_element)
            continue

        # Use the first existing column whose midpoint is within tolerance.
        matching_column = next(
            (
                column
                for column in columns
                if abs(layout_element.x_midpoint - column.x_midpoint) < column_tolerance
            ),
            None,
        )
        if matching_column is None:
            columns.append(Column(layout_elements=[layout_element]))
        else:
            matching_column.add_element(layout_element)

    # Flush whatever columns remain after the last full-width element.
    sorted_layout.extend(sorted_layout_from_columns(columns))
    return sorted_layout
78+
79+
80+
def sorted_layout_from_columns(columns: List[Column]) -> List[TextRegion]:
    """Creates a sorted list of elements from a list of columns. Columns will be sorted
    left to right (by x_midpoint) and elements within columns are sorted top to bottom
    (by y1).

    Neither the list of columns nor the element lists inside the columns are reordered;
    sorting is done on copies. An empty list of columns yields an empty list.
    """
    sorted_layout: List[TextRegion] = []
    for column in sorted(columns, key=lambda column: column.x_midpoint):
        sorted_layout.extend(sorted(column.layout_elements, key=lambda element: element.y1))
    return sorted_layout
91+
92+
93+
def calculate_width(layout: List[TextRegion]) -> Union[float, int]:
    """Calculates total width of the elements in the layout, i.e. the distance from the
    smallest x1 to the largest x2. Used for computing the full page threshold and column
    tolerance. Returns 0 for an empty layout."""
    if not layout:
        # min()/max() raise ValueError on an empty sequence; an empty layout has no extent.
        return 0

    min_x1 = min(element.x1 for element in layout)
    max_x2 = max(element.x2 for element in layout)

    return max_x2 - min_x1

0 commit comments

Comments
 (0)