Skip to content

Commit 3d2045e

Browse files
Feat/176 elements detected by ocr supplement detected layout (#186)
Closes [#176](#176).

### Summary
- Add functionality to annotate any layout (extracted, inferred, OCR) on a page
- Add a Python script and a Jupyter Notebook for annotating layout elements
- Add functionality to get layout elements from OCR regions
- Add functionality to supplement detected layout with elements from the full-page OCR
- Add the utility function `annotate_layout_elements()`
- Add a Python script and a Jupyter Notebook to evaluate the feature implemented in this branch

### Testing
```
from unstructured_inference.inference.layout import DocumentLayout

# pdf
doc = DocumentLayout.from_file("sample-docs/patent-1p.pdf")

# image
doc = DocumentLayout.from_image_file("sample-docs/layout-parser-paper-fast.jpg")
```

### Evaluation
A Python script and a Jupyter Notebook were added for evaluation.
```
PYTHONPATH=. python examples/ocr_layout_supplement/ocr_layout_supplement.py <file_path> <file_type>
```
For example,
```
PYTHONPATH=. python examples/ocr_layout_supplement/ocr_layout_supplement.py sample-docs/patent-1p.pdf pdf
```
1 parent b139d3b commit 3d2045e

File tree

28 files changed

+1091
-184
lines changed

28 files changed

+1091
-184
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,3 +143,4 @@ dmypy.json
143143
.vscode/
144144

145145
sample-docs/*_images
146+
examples/**/output

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.5.19
2+
3+
* Add functionality to supplement detected layout with elements from the full page OCR
4+
* Add functionality to annotate any layout (extracted, inferred, OCR) on a page
5+
16
## 0.5.18
27

38
* Fix for incorrect type assignation at ingest test

examples/layout_analysis/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Analyzing Layout Elements
2+
3+
This directory contains examples of how to analyze layout elements.
4+
5+
## How to run
6+
7+
Run `pip install -r requirements.txt` to install the Python dependencies.
8+
9+
### Visualization
10+
- Python script (visualization.py)
11+
```
12+
PYTHONPATH=. python examples/layout_analysis/visualization.py <file_path>
13+
```
14+
For example,
15+
```
16+
PYTHONPATH=. python examples/layout_analysis/visualization.py sample-docs/loremipsum.pdf
17+
```
18+
- Jupyter Notebook (visualization.ipynb)
19+
- Run `jupyter-notebook` to start.
20+
- Run the `visualization.ipynb` notebook.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
unstructured-inference
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import os\n",
10+
"\n",
11+
"from unstructured_inference.inference.layout import process_file_with_model\n",
12+
"from unstructured_inference.visualize import show_plot"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": null,
18+
"outputs": [],
19+
"source": [
20+
"annotation_data_map = {\n",
21+
" \"final\": None,\n",
22+
" \"extracted\": {\"layout\": {\"color\": \"green\", \"width\": 2}},\n",
23+
" \"inferred\": {\"inferred_layout\": {\"color\": \"blue\", \"width\": 2}},\n",
24+
" \"ocr\": {\"ocr_layout\": {\"color\": \"yellow\", \"width\": 2}},\n",
25+
"}"
26+
],
27+
"metadata": {
28+
"collapsed": false
29+
}
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": null,
34+
"metadata": {},
35+
"outputs": [],
36+
"source": [
37+
"f_path = \"../../sample-docs/loremipsum.pdf\"\n",
38+
"f_name = os.path.basename(f_path)"
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": null,
44+
"metadata": {},
45+
"outputs": [],
46+
"source": [
47+
"doc = process_file_with_model(\n",
48+
" f_path,\n",
49+
" model_name=None,\n",
50+
" analysis=True,\n",
51+
")"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": null,
57+
"metadata": {},
58+
"outputs": [],
59+
"source": [
60+
"for idx, page in enumerate(doc.pages):\n",
61+
" for action_type, action_value in annotation_data_map.items():\n",
62+
" img = page.annotate(annotation_data=action_value)\n",
63+
" if action_value is None:\n",
64+
" n_layout_elements = len(page.elements)\n",
65+
" else:\n",
66+
" attribute = list(action_value.keys())[0]\n",
67+
" n_layout_elements = len(getattr(page, attribute))\n",
68+
" print(f\"Filename: {f_name} - Page: {idx+1} - Layout: {action_type} - n_layout_elements: {n_layout_elements}\")\n",
69+
" show_plot(img, desired_width=14)"
70+
]
71+
}
72+
],
73+
"metadata": {
74+
"kernelspec": {
75+
"display_name": "Python 3 (ipykernel)",
76+
"language": "python",
77+
"name": "python3"
78+
},
79+
"language_info": {
80+
"codemirror_mode": {
81+
"name": "ipython",
82+
"version": 3
83+
},
84+
"file_extension": ".py",
85+
"mimetype": "text/x-python",
86+
"name": "python",
87+
"nbconvert_exporter": "python",
88+
"pygments_lexer": "ipython3",
89+
"version": "3.8.15"
90+
}
91+
},
92+
"nbformat": 4,
93+
"nbformat_minor": 1
94+
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import os
2+
import pathlib
3+
import sys
4+
5+
from unstructured_inference.inference.layout import process_file_with_model
6+
from unstructured_inference.utils import write_image
7+
8+
# Absolute path of the directory containing this script; the "output" folder is created under it.
CUR_DIR = pathlib.Path(__file__).parent.resolve()
9+
10+
11+
def run(f_path, output_basedir_path=None):
    """Annotate every page of a document with each layout and save the images.

    The file is processed with ``process_file_with_model(..., analysis=True)``.
    For each page, one annotated JPEG is written per entry of the annotation
    map below, named ``<basename>_<page number>_<layout name>.jpg``.

    Args:
        f_path: Path to the input document.
        output_basedir_path: Directory under which a per-document output
            folder is created. Defaults to ``<script directory>/output`` so
            that the function also works when this module is imported
            (previously it relied on a global that only existed when the
            file was executed as a script).
    """
    if output_basedir_path is None:
        output_basedir_path = os.path.join(CUR_DIR, "output")

    # Layout name -> annotation settings passed to ``page.annotate``.
    # ``None`` selects the default annotation (the "final" layout); the other
    # entries are keyed by a layout attribute name with its drawing style.
    annotation_data_map = {
        "final": None,
        "extracted": {"layout": {"color": "green", "width": 2}},
        "inferred": {"inferred_layout": {"color": "blue", "width": 2}},
        "ocr": {"ocr_layout": {"color": "yellow", "width": 2}},
    }

    f_basename = os.path.splitext(os.path.basename(f_path))[0]
    output_dir_path = os.path.join(output_basedir_path, f_basename)
    # makedirs also creates the base directory if it does not exist yet.
    os.makedirs(output_dir_path, exist_ok=True)

    doc = process_file_with_model(
        f_path,
        model_name=None,
        analysis=True,
    )

    for page_number, page in enumerate(doc.pages, start=1):
        for layout_name, annotation_data in annotation_data_map.items():
            img = page.annotate(annotation_data=annotation_data)
            output_f_path = os.path.join(
                output_dir_path,
                f"{f_basename}_{page_number}_{layout_name}.jpg",
            )
            write_image(img, output_f_path)
34+
35+
36+
if __name__ == '__main__':
    # The script consumes exactly one positional argument: the input file path.
    if len(sys.argv) < 2:
        print("Please provide the path to the file as the first argument.")
        sys.exit(1)

    # Kept as a module-level name because run() may read it as a global.
    output_basedir_path = os.path.join(CUR_DIR, "output")
    os.makedirs(output_basedir_path, exist_ok=True)

    run(f_path=sys.argv[1])
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Supplementing detected layout with elements from the full-page OCR
2+
3+
This directory contains examples of how to supplement the detected layout with elements from the full-page OCR.
4+
5+
## Running the example
6+
7+
Run `pip install -r requirements.txt` to install the Python dependencies.
8+
9+
### Running python script
10+
```
11+
PYTHONPATH=. python examples/ocr_layout_supplement/ocr_layout_supplement.py <file_path> <file_type>
12+
```
13+
For example,
14+
```
15+
PYTHONPATH=. python examples/ocr_layout_supplement/ocr_layout_supplement.py sample-docs/patent-1p.pdf pdf
16+
```
17+
### Running jupyter notebook
18+
- Run `jupyter-notebook` to start.
19+
- Run the `ocr_layout_supplement.ipynb` notebook.
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"outputs": [],
7+
"source": [
8+
"import os\n",
9+
"import sys\n",
10+
"\n",
11+
"from unstructured_inference.constants import AnnotationResult\n",
12+
"from unstructured_inference.inference.layout import process_file_with_model\n",
13+
"from unstructured_inference.utils import annotate_layout_elements"
14+
],
15+
"metadata": {
16+
"collapsed": false
17+
}
18+
},
19+
{
20+
"cell_type": "code",
21+
"execution_count": null,
22+
"outputs": [],
23+
"source": [
24+
"output_basedir_path = \"output\""
25+
],
26+
"metadata": {
27+
"collapsed": false
28+
}
29+
},
30+
{
31+
"cell_type": "code",
32+
"execution_count": null,
33+
"outputs": [],
34+
"source": [
35+
"def run(f_path, f_type):\n",
36+
" if f_type == \"pdf\":\n",
37+
" is_image = False\n",
38+
" elif f_type == \"image\":\n",
39+
" is_image = True\n",
40+
" else:\n",
41+
" print(\"Invalid file type.\")\n",
42+
" sys.exit(1)\n",
43+
"\n",
44+
" annotation_data_map = {\n",
45+
" \"final\": None,\n",
46+
" }\n",
47+
" actions = [False, True]\n",
48+
" for action in actions:\n",
49+
" _f_basename = os.path.splitext(os.path.basename(f_path))[0]\n",
50+
"        output_dir_path = os.path.join(output_basedir_path, f\"{_f_basename}_{f_type}\")\n",
51+
" os.makedirs(output_dir_path, exist_ok=True)\n",
52+
"\n",
53+
" f_basename = f\"updated_{_f_basename}\" if action else f\"original_{_f_basename}\"\n",
54+
"\n",
55+
" label = \"Updated Results: \" if action else \"Original Results: \"\n",
56+
" print(label)\n",
57+
"\n",
58+
" doc = process_file_with_model(\n",
59+
" f_path,\n",
60+
" is_image=is_image,\n",
61+
" model_name=None,\n",
62+
" supplement_with_ocr_elements=action,\n",
63+
" analysis=True,\n",
64+
" )\n",
65+
"\n",
66+
" annotate_layout_elements(doc, annotation_data_map, output_dir_path, f_basename, AnnotationResult.PLOT)"
67+
],
68+
"metadata": {
69+
"collapsed": false
70+
}
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": null,
75+
"outputs": [],
76+
"source": [
77+
"file_path = \"../../sample-docs/patent-1p.pdf\"\n",
78+
"file_type = \"pdf\"\n",
79+
"f_name = os.path.basename(file_path)\n",
80+
"print(f\"file_name: {f_name} - file_type: {file_type}\")\n",
81+
"\n",
82+
"run(file_path, file_type)"
83+
],
84+
"metadata": {
85+
"collapsed": false
86+
}
87+
},
88+
{
89+
"cell_type": "code",
90+
"execution_count": null,
91+
"outputs": [],
92+
"source": [
93+
"file_path = \"../../sample-docs/layout-parser-paper-fast.jpg\"\n",
94+
"file_type = \"image\"\n",
95+
"f_name = os.path.basename(file_path)\n",
96+
"print(f\"file_name: {f_name} - file_type: {file_type}\")\n",
97+
"\n",
98+
"run(file_path, file_type)"
99+
],
100+
"metadata": {
101+
"collapsed": false
102+
}
103+
}
104+
],
105+
"metadata": {
106+
"kernelspec": {
107+
"display_name": "Python 3",
108+
"language": "python",
109+
"name": "python3"
110+
},
111+
"language_info": {
112+
"codemirror_mode": {
113+
"name": "ipython",
114+
"version": 2
115+
},
116+
"file_extension": ".py",
117+
"mimetype": "text/x-python",
118+
"name": "python",
119+
"nbconvert_exporter": "python",
120+
"pygments_lexer": "ipython2",
121+
"version": "2.7.6"
122+
}
123+
},
124+
"nbformat": 4,
125+
"nbformat_minor": 0
126+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import os
2+
import pathlib
3+
import sys
4+
5+
from unstructured_inference.constants import AnnotationResult
6+
from unstructured_inference.inference.layout import process_file_with_model
7+
from unstructured_inference.utils import annotate_layout_elements
8+
9+
# Absolute path of the directory containing this script; the "output" folder is created under it.
CUR_DIR = pathlib.Path(__file__).parent.resolve()
10+
11+
12+
def run(f_path, file_type, output_basedir_path=None):
    """Process a file with and without OCR supplementation and save annotations.

    The document is processed twice with ``process_file_with_model``: first
    with ``supplement_with_ocr_elements=False`` (outputs prefixed
    ``original_``), then with ``True`` (outputs prefixed ``updated_``). Each
    result is handed to ``annotate_layout_elements`` with
    ``AnnotationResult.IMAGE``.

    Args:
        f_path: Path to the input document.
        file_type: Either ``"pdf"`` or ``"image"``. Any other value prints an
            error message and exits the process with status 1.
        output_basedir_path: Directory under which the
            ``<basename>_<file_type>`` output folder is created. Defaults to
            ``<script directory>/output`` so that the function also works when
            this module is imported (previously it relied on a global that
            only existed when the file was executed as a script).
    """
    print(">>> Start...")
    print(f">>> file_path: {f_path} - file_type: {file_type}")

    if file_type not in ("pdf", "image"):
        print("Invalid file type.")
        sys.exit(1)
    is_image = file_type == "image"

    if output_basedir_path is None:
        output_basedir_path = os.path.join(CUR_DIR, "output")

    # ``None`` selects the default annotation of the final layout.
    annotation_data_map = {
        "final": None,
    }

    # The output folder does not depend on the loop variable, so build it once.
    source_basename = os.path.splitext(os.path.basename(f_path))[0]
    output_dir_path = os.path.join(output_basedir_path, f"{source_basename}_{file_type}")
    os.makedirs(output_dir_path, exist_ok=True)

    # Baseline run first, then the run supplemented with OCR elements.
    for supplement_with_ocr in (False, True):
        prefix = "updated" if supplement_with_ocr else "original"
        f_basename = f"{prefix}_{source_basename}"

        doc = process_file_with_model(
            f_path,
            is_image=is_image,
            model_name=None,
            supplement_with_ocr_elements=supplement_with_ocr,
            analysis=True,
        )

        annotate_layout_elements(
            doc,
            annotation_data_map,
            output_dir_path,
            f_basename,
            AnnotationResult.IMAGE,
        )

    print("<<< Finished")
47+
48+
49+
if __name__ == '__main__':
    # Two positional arguments are consumed: the input file path and its type.
    if len(sys.argv) < 3:
        print(
            "Please provide the path to the file as the first argument and the file type "
            "('pdf' or 'image') as the second argument.",
        )
        sys.exit(1)

    # Kept as a module-level name because run() may read it as a global.
    output_basedir_path = os.path.join(CUR_DIR, "output")
    os.makedirs(output_basedir_path, exist_ok=True)

    run(f_path=sys.argv[1], file_type=sys.argv[2])

0 commit comments

Comments
 (0)