|
7 | 7 | from pathlib import Path |
8 | 8 | from typing import List, Optional, Union |
9 | 9 |
|
| 10 | +import cv2 |
10 | 11 | import numpy as np |
11 | 12 | import pandas as pd |
12 | 13 | import pytesseract |
|
17 | 18 | from unstructured_inference.config import inference_config |
18 | 19 | from unstructured_inference.logger import logger |
19 | 20 | from unstructured_inference.models.table_postprocess import Rect |
| 21 | +from unstructured_inference.models.tesseract import ( |
| 22 | + TESSERACT_TEXT_HEIGHT, |
| 23 | +) |
20 | 24 | from unstructured_inference.models.unstructuredmodel import UnstructuredModel |
21 | 25 | from unstructured_inference.utils import pad_image_with_background_color |
22 | 26 |
|
@@ -79,23 +83,45 @@ def get_tokens(self, x: Image): |
79 | 83 | ymax = max([i[1] for i in line[0]]) |
80 | 84 | tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]}) |
81 | 85 | else: |
| 86 | + zoom = 1 |
| 87 | + |
82 | 88 | logger.info("Processing table OCR with tesseract...") |
83 | 89 | ocr_df: pd.DataFrame = pytesseract.image_to_data( |
84 | 90 | x, |
85 | 91 | output_type="data.frame", |
86 | 92 | ) |
87 | | - |
88 | 93 | ocr_df = ocr_df.dropna() |
89 | 94 |
|
| 95 | + # tesseract performance degrades when the text height is out of the preferred zone so we |
| 96 | + # zoom the image (in or out depending on estimated text height) for optimum OCR results |
| 97 | + # but this needs to be evaluated based on actual use case as the optimum scaling also |
| 98 | +                # depends on the type of characters (font, language, etc); be careful about this
| 99 | + # functionality |
| 100 | + text_height = ocr_df[TESSERACT_TEXT_HEIGHT].quantile( |
| 101 | + inference_config.TESSERACT_TEXT_HEIGHT_QUANTILE, |
| 102 | + ) |
| 103 | + if ( |
| 104 | + text_height < inference_config.TESSERACT_MIN_TEXT_HEIGHT |
| 105 | + or text_height > inference_config.TESSERACT_MAX_TEXT_HEIGHT |
| 106 | + ): |
| 107 | +                # rounding avoids unnecessary precision and potential numerical issues associated
| 108 | + # with numbers very close to 1 inside cv2 image processing |
| 109 | + zoom = np.round(inference_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1) |
| 110 | + ocr_df = pytesseract.image_to_data( |
| 111 | + zoom_image(x, zoom), |
| 112 | + output_type="data.frame", |
| 113 | + ) |
| 114 | + ocr_df = ocr_df.dropna() |
| 115 | + |
90 | 116 | tokens = [] |
91 | 117 | for idtx in ocr_df.itertuples(): |
92 | 118 | tokens.append( |
93 | 119 | { |
94 | 120 | "bbox": [ |
95 | | - idtx.left, |
96 | | - idtx.top, |
97 | | - idtx.left + idtx.width, |
98 | | - idtx.top + idtx.height, |
| 121 | + idtx.left / zoom, |
| 122 | + idtx.top / zoom, |
| 123 | + (idtx.left + idtx.width) / zoom, |
| 124 | + (idtx.top + idtx.height) / zoom, |
99 | 125 | ], |
100 | 126 | "text": idtx.text, |
101 | 127 | }, |
@@ -688,3 +714,21 @@ def cells_to_html(cells): |
688 | 714 | tcell.text = cell["cell text"] |
689 | 715 |
|
690 | 716 | return str(ET.tostring(table, encoding="unicode", short_empty_elements=False)) |
| 717 | + |
| 718 | + |
def zoom_image(image: Image, zoom: float) -> Image:
    """Scale an image by the given zoom factor using cv2 for OCR preprocessing.

    The scaled image is post processed by dilation then erosion to improve edge
    sharpness for OCR tasks.

    Parameters
    ----------
    image : PIL.Image
        Input image, assumed to be in RGB channel order (PIL's convention).
    zoom : float
        Scale factor applied to both axes; > 1 enlarges, < 1 shrinks.

    Returns
    -------
    PIL.Image
        The rescaled image, in RGB channel order.
    """
    # cv2 works in BGR order, so convert on the way in; cubic interpolation keeps
    # glyph edges smoother than nearest/linear when enlarging text for OCR
    new_image = cv2.resize(
        cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR),
        None,
        fx=zoom,
        fy=zoom,
        interpolation=cv2.INTER_CUBIC,
    )

    # NOTE(review): a 1x1 kernel makes dilate/erode no-ops; a larger kernel
    # (e.g. (2, 2)) is required for the sharpening described in the docstring --
    # kept as-is pending evaluation on real OCR inputs
    kernel = np.ones((1, 1), np.uint8)
    new_image = cv2.dilate(new_image, kernel, iterations=1)
    new_image = cv2.erode(new_image, kernel, iterations=1)

    # bug fix: convert back from BGR to RGB before handing the array to PIL,
    # otherwise the returned image has its red and blue channels swapped
    return Image.fromarray(cv2.cvtColor(new_image, cv2.COLOR_BGR2RGB))
0 commit comments