
Commit 15bbc56

Feat/147 ocr entire page (#159)
Addresses GitHub issue #147

* Add functionality to extract the text aggregated from the regions of the OCR layout that lie within a given block
* Add functionality to merge the inferred layout with the OCR layout
* Add functionality to populate inferred region text with OCR text when merging the inferred layout with the embedded layout
* Populate inferred region text with OCR text only for inferred regions that are not already populated with text
* Make entire-page OCR optional
* Update the evaluation script
1 parent 203f7ab commit 15bbc56
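
The entire-page OCR path is exposed as an option: the evaluation script in this diff selects it by passing `ocr_mode` to `process_file_with_model`, using the values `"individual_blocks"` and `"entire_page"`. A minimal usage sketch under that assumption (the sample file name is taken from the evaluation script in this commit):

from unstructured_inference.inference.layout import process_file_with_model

# "individual_blocks" OCRs each detected layout region separately (previous behavior);
# "entire_page" OCRs the full page once and assigns text to the inferred regions.
layout = process_file_with_model(
    "sample-docs/loremipsum_multipage.pdf",
    model_name=None,
    ocr_mode="entire_page",
)
print(str(layout))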

File tree

14 files changed (+499, -12 lines)


CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,7 @@
+## 0.5.10
+
+* Implement full-page OCR
+
 ## 0.5.9

 * Handle exceptions from Tesseract

examples/ocr/engine.py

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
import os
import re
import time
from typing import List, cast

import cv2
import numpy as np
import pytesseract
from pytesseract import Output

from unstructured_inference.inference import layout
from unstructured_inference.inference.elements import Rectangle, TextRegion


def remove_non_printable(s):
    # Replace non-printable characters with spaces and collapse repeated whitespace.
    dst_str = re.sub(r'[^\x20-\x7E]', ' ', s)
    return ' '.join(dst_str.split())


def run_ocr_with_layout_detection(
    images,
    detection_model=None,
    element_extraction_model=None,
    mode="individual_blocks",
    output_dir="",
    drawable=True,
    printable=True,
):
    total_text_extraction_infer_time = 0
    total_extracted_text = {}
    for i, image in enumerate(images):
        page_num = i + 1
        page_num_str = f"page{page_num}"

        page = layout.PageLayout(
            number=i + 1,
            image=image,
            layout=None,
            detection_model=detection_model,
            element_extraction_model=element_extraction_model,
        )

        inferred_layout: List[TextRegion] = cast(List[TextRegion], page.detection_model(page.image))

        cv_img = np.array(image)

        if mode == "individual_blocks":
            # OCR'ing individual blocks (current approach)
            text_extraction_start_time = time.time()

            elements = page.get_elements_from_layout(inferred_layout)

            text_extraction_infer_time = time.time() - text_extraction_start_time

            total_text_extraction_infer_time += text_extraction_infer_time

            page_text = ""
            for el in elements:
                page_text += el.text
            filtered_page_text = remove_non_printable(page_text)
            total_extracted_text[page_num_str] = filtered_page_text
        elif mode == "entire_page":
            # OCR'ing entire page (new approach to implement)
            text_extraction_start_time = time.time()

            ocr_data = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT)
            boxes = ocr_data['level']
            extracted_text_list = []
            for k in range(len(boxes)):
                (x, y, w, h) = ocr_data['left'][k], ocr_data['top'][k], ocr_data['width'][k], ocr_data['height'][k]
                extracted_text = ocr_data['text'][k]
                if not extracted_text:
                    continue

                extracted_region = Rectangle(x1=x, y1=y, x2=x + w, y2=y + h)

                # Keep the OCR region only if it falls (mostly) inside one of the padded inferred regions.
                extracted_is_subregion_of_inferred = False
                for inferred_region in inferred_layout:
                    extracted_is_subregion_of_inferred = extracted_region.is_almost_subregion_of(
                        inferred_region.pad(12),
                        subregion_threshold=0.75,
                    )
                    if extracted_is_subregion_of_inferred:
                        break

                if extracted_is_subregion_of_inferred:
                    extracted_text_list.append(extracted_text)

                if drawable:
                    # Green box: matched an inferred region; blue box: unmatched OCR region.
                    if extracted_is_subregion_of_inferred:
                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (0, 255, 0), 2)
                    else:
                        cv2.rectangle(cv_img, (x, y), (x + w, y + h), (255, 0, 0), 2)

            text_extraction_infer_time = time.time() - text_extraction_start_time
            total_text_extraction_infer_time += text_extraction_infer_time

            page_text = " ".join(extracted_text_list)
            filtered_page_text = remove_non_printable(page_text)
            total_extracted_text[page_num_str] = filtered_page_text
        else:
            raise ValueError("Invalid mode")

        if drawable:
            # Draw the inferred layout regions in red and write the annotated page image.
            for el in inferred_layout:
                pt1 = [int(el.x1), int(el.y1)]
                pt2 = [int(el.x2), int(el.y2)]
                cv2.rectangle(
                    img=cv_img,
                    pt1=pt1, pt2=pt2,
                    color=(0, 0, 255),
                    thickness=4,
                )

            f_path = os.path.join(output_dir, f"ocr_{mode}_{page_num_str}.jpg")
            cv2.imwrite(f_path, cv_img)

        if printable:
            print(f"page: {i + 1} - n_layout_elements: {len(inferred_layout)} - "
                  f"text_extraction_infer_time: {text_extraction_infer_time}")

    return total_text_extraction_infer_time, total_extracted_text


def run_ocr(
    images,
    printable=True,
):
    total_text_extraction_infer_time = 0
    total_text = ""
    for i, image in enumerate(images):
        text_extraction_start_time = time.time()

        page_text = pytesseract.image_to_string(image)

        text_extraction_infer_time = time.time() - text_extraction_start_time

        if printable:
            print(f"page: {i + 1} - text_extraction_infer_time: {text_extraction_infer_time}")

        total_text_extraction_infer_time += text_extraction_infer_time
        total_text += page_text

    return total_text_extraction_infer_time, total_text
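
For reference, a hedged usage sketch of the helper above. It assumes the script is run from `examples/ocr/`, that pages are rendered with `pdf2image` (as the evaluation script does), and that a detection model instance is obtained via `unstructured_inference.models.base.get_model()` (an assumption; any detection model accepted by `PageLayout` would work):

import pdf2image

from engine import run_ocr_with_layout_detection  # assumes the current directory is examples/ocr
from unstructured_inference.models.base import get_model  # assumed helper for loading the default detection model

# Render the sample PDF to page images, mirroring the evaluation script's setup.
images = pdf2image.convert_from_path("../../sample-docs/loremipsum_multipage.pdf")
detection_model = get_model()

infer_time, extracted_text = run_ocr_with_layout_detection(
    images,
    detection_model=detection_model,
    mode="entire_page",
    output_dir="output",
    drawable=True,
)
print(infer_time, list(extracted_text.keys()))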

examples/ocr/output/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
*

examples/ocr/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
unstructured[local-inference]
nltk

Lines changed: 204 additions & 0 deletions
@@ -0,0 +1,204 @@
import json
import os
import time
from datetime import datetime
from difflib import SequenceMatcher

import nltk
import pdf2image

from unstructured_inference.inference.layout import (
    DocumentLayout,
    create_image_output_dir,
    process_file_with_model,
)

# Download the required resources (run this once)
nltk.download('punkt')


def validate_performance(
    f_name,
    validation_mode,
    is_image_file=False,
):
    print(f">>> Start performance comparison - filename: {f_name} - validation_mode: {validation_mode}"
          f" - is_image_file: {is_image_file}")

    now_dt = datetime.utcnow()
    now_str = now_dt.strftime("%Y_%m_%d-%H_%M_%S")

    f_path = os.path.join(example_docs_dir, f_name)

    image_f_paths = []
    if validation_mode == "pdf":
        pdf_info = pdf2image.pdfinfo_from_path(f_path)
        n_pages = pdf_info["Pages"]
    elif validation_mode == "image":
        if is_image_file:
            image_f_paths.append(f_path)
        else:
            image_output_dir = create_image_output_dir(f_path)
            images = pdf2image.convert_from_path(f_path, output_folder=image_output_dir)
            image_f_paths = [image.filename for image in images]
        n_pages = len(image_f_paths)
    else:
        n_pages = 0

    processing_result = {}
    for ocr_mode in ["individual_blocks", "entire_page"]:
        start_time = time.time()

        if validation_mode == "pdf":
            layout = process_file_with_model(
                f_path,
                model_name=None,
                ocr_mode=ocr_mode,
            )
        elif validation_mode == "image":
            pages = []
            for image_f_path in image_f_paths:
                _layout = process_file_with_model(
                    image_f_path,
                    model_name=None,
                    ocr_mode=ocr_mode,
                    is_image=True,
                )
                pages += _layout.pages
            for i, page in enumerate(pages):
                page.number = i + 1
            layout = DocumentLayout.from_pages(pages)
        else:
            layout = None

        infer_time = time.time() - start_time

        if layout is None:
            print("Layout is None")
            return

        full_text = str(layout)
        page_text = {}
        for page in layout.pages:
            page_text[page.number] = str(page)

        processing_result[ocr_mode] = {
            "infer_time": infer_time,
            "full_text": full_text,
            "page_text": page_text,
        }

    individual_mode_page_text = processing_result["individual_blocks"]["page_text"]
    entire_mode_page_text = processing_result["entire_page"]["page_text"]
    individual_mode_full_text = processing_result["individual_blocks"]["full_text"]
    entire_mode_full_text = processing_result["entire_page"]["full_text"]

    compare_result = compare_processed_text(individual_mode_full_text, entire_mode_full_text)

    report = {
        "validation_mode": validation_mode,
        "file_info": {
            "filename": f_name,
            "n_pages": n_pages,
        },
        "processing_time": {
            "individual_blocks": processing_result["individual_blocks"]["infer_time"],
            "entire_page": processing_result["entire_page"]["infer_time"],
        },
        "text_similarity": compare_result,
        "extracted_text": {
            "individual_blocks": {
                "page_text": individual_mode_page_text,
                "full_text": individual_mode_full_text,
            },
            "entire_page": {
                "page_text": entire_mode_page_text,
                "full_text": entire_mode_full_text,
            },
        },
    }

    write_report(report, now_str, validation_mode)

    print("<<< End performance comparison", f_name)


def compare_processed_text(individual_mode_full_text, entire_mode_full_text, delimiter=" "):
    # Calculate similarity ratio
    similarity_ratio = SequenceMatcher(None, individual_mode_full_text, entire_mode_full_text).ratio()

    print(f"similarity_ratio: {similarity_ratio}")

    # Tokenize the text into words
    word_list_individual = nltk.word_tokenize(individual_mode_full_text)
    n_word_list_individual = len(word_list_individual)
    print("n_word_list_in_text_individual:", n_word_list_individual)
    word_sets_individual = set(word_list_individual)
    n_word_sets_individual = len(word_sets_individual)
    print(f"n_word_sets_in_text_individual: {n_word_sets_individual}")

    word_list_entire = nltk.word_tokenize(entire_mode_full_text)
    n_word_list_entire = len(word_list_entire)
    print("n_word_list_in_text_entire:", n_word_list_entire)
    word_sets_entire = set(word_list_entire)
    n_word_sets_entire = len(word_sets_entire)
    print(f"n_word_sets_in_text_entire: {n_word_sets_entire}")

    # Find unique elements using difference
    print("diff_elements:")
    unique_words_individual = word_sets_individual - word_sets_entire
    unique_words_entire = word_sets_entire - word_sets_individual
    print(f"unique_words_in_text_individual: {unique_words_individual}\n")
    print(f"unique_words_in_text_entire: {unique_words_entire}")

    return {
        "similarity_ratio": similarity_ratio,
        "individual_blocks": {
            "n_word_list": n_word_list_individual,
            "n_word_sets": n_word_sets_individual,
            "unique_words": delimiter.join(list(unique_words_individual)),
        },
        "entire_page": {
            "n_word_list": n_word_list_entire,
            "n_word_sets": n_word_sets_entire,
            "unique_words": delimiter.join(list(unique_words_entire)),
        },
    }


def write_report(report, now_str, validation_mode):
    report_f_name = f"validate-ocr-{validation_mode}-{now_str}.json"
    report_f_path = os.path.join(output_dir, report_f_name)
    with open(report_f_path, "w", encoding="utf-8-sig") as f:
        json.dump(report, f, indent=4)


def run():
    test_files = [
        {"name": "layout-parser-paper-fast.pdf", "mode": "image", "is_image_file": False},
        {"name": "loremipsum_multipage.pdf", "mode": "image", "is_image_file": False},
        {"name": "2023-Jan-economic-outlook.pdf", "mode": "image", "is_image_file": False},
        {"name": "recalibrating-risk-report.pdf", "mode": "image", "is_image_file": False},
        {"name": "Silent-Giant.pdf", "mode": "image", "is_image_file": False},
    ]

    for test_file in test_files:
        f_name = test_file["name"]
        validation_mode = test_file["mode"]
        is_image_file = test_file["is_image_file"]

        validate_performance(f_name, validation_mode, is_image_file)


if __name__ == '__main__':
    cur_dir = os.getcwd()
    base_dir = os.path.join(cur_dir, os.pardir, os.pardir)
    example_docs_dir = os.path.join(base_dir, "sample-docs")

    # folder path to save temporary outputs
    output_dir = os.path.join(cur_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    run()
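
The report's `similarity_ratio` comes from `difflib.SequenceMatcher`, which scores two strings between 0.0 and 1.0 based on matching subsequences. A small self-contained illustration of how the metric behaves on near-identical OCR output (the example strings are made up):

from difflib import SequenceMatcher

a = "The quick brown fox jumps over the lazy dog"
b = "The quick brown fox jumps over the 1azy dog"  # one OCR confusion: 'l' -> '1'
print(SequenceMatcher(None, a, b).ratio())  # close to 1.0 (about 0.98 here)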

sample-docs/Silent-Giant.pdf

5.88 MB
Binary file not shown.
