11from __future__ import annotations
22
33import os
4+ import re
45from typing import TYPE_CHECKING , List
56
67import cv2
78import numpy as np
89import pandas as pd
910import unstructured_pytesseract
11+ from bs4 import BeautifulSoup , Tag
1012from PIL import Image as PILImage
11- from unstructured_pytesseract import Output
1213
1314from unstructured .logger import trace_logger
1415from unstructured .partition .utils .config import env_config
@@ -47,10 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
4748
4849 trace_logger .detail ("Processing entire page OCR with tesseract..." )
4950 zoom = 1
50- ocr_df : pd .DataFrame = unstructured_pytesseract . image_to_data (
51+ ocr_df : pd .DataFrame = self . image_to_data_with_character_confidence_filter (
5152 np .array (image ),
5253 lang = self .language ,
53- output_type = Output . DATAFRAME ,
54+ character_confidence_threshold = env_config . TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD ,
5455 )
5556 ocr_df = ocr_df .dropna ()
5657
@@ -76,17 +77,94 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
7677 np .round (env_config .TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height , 1 ),
7778 max_zoom ,
7879 )
79- ocr_df = unstructured_pytesseract . image_to_data (
80+ ocr_df = self . image_to_data_with_character_confidence_filter (
8081 np .array (zoom_image (image , zoom )),
8182 lang = self .language ,
82- output_type = Output . DATAFRAME ,
83+ character_confidence_threshold = env_config . TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD ,
8384 )
8485 ocr_df = ocr_df .dropna ()
85-
8686 ocr_regions = self .parse_data (ocr_df , zoom = zoom )
8787
8888 return ocr_regions
8989
def image_to_data_with_character_confidence_filter(
    self,
    image: np.ndarray,
    lang: str = "eng",
    config: str = "",
    character_confidence_threshold: float = 0.0,
) -> pd.DataFrame:
    """Run tesseract on ``image`` via hOCR and return the result as a dataframe.

    The image is rendered to hOCR with the ``hocr_char_boxes`` option switched on, and the
    resulting markup is handed to ``hocr_to_dataframe`` together with
    ``character_confidence_threshold``.

    Args:
        image: image array passed to tesseract.
        lang: tesseract language code(s).
        config: extra tesseract configuration, appended after the ``hocr_char_boxes`` switch.
        character_confidence_threshold: forwarded unchanged to ``hocr_to_dataframe``.

    Returns:
        The dataframe built by ``hocr_to_dataframe``.
    """
    # The char-box switch always comes first; caller-supplied options follow it.
    tesseract_config = "-c hocr_char_boxes=1 " + config
    hocr_markup = unstructured_pytesseract.image_to_pdf_or_hocr(
        image,
        lang=lang,
        config=tesseract_config,
        extension="hocr",
    )
    return self.hocr_to_dataframe(hocr_markup, character_confidence_threshold)
105+
def hocr_to_dataframe(
    self, hocr: str, character_confidence_threshold: float = 0.0
) -> pd.DataFrame:
    """Convert hOCR markup into a dataframe with one row per recognised word.

    Every ``ocrx_word`` span contributes a row when its title carries a ``bbox`` entry and
    ``extract_word_from_hocr`` returns non-empty text for it; other spans are skipped.

    Args:
        hocr: hOCR markup to parse.
        character_confidence_threshold: forwarded to ``extract_word_from_hocr``.

    Returns:
        A dataframe with the columns ``left``, ``top``, ``text``, ``width`` and ``height``.
    """
    rows = []
    parsed = BeautifulSoup(hocr, "html.parser")
    for span in parsed.find_all("span", class_="ocrx_word"):
        coords = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", span.get("title", ""))

        # Note: word bbox is used instead of combining characters together due to tesseract
        # bug that causes the character bboxes to be outside the word bbox, and they have 0
        # height or width when text is horizontal
        word_text = self.extract_word_from_hocr(
            word=span, character_confidence_threshold=character_confidence_threshold
        )
        if not word_text or not coords:
            continue

        x_min, y_min, x_max, y_max = (int(value) for value in coords.groups())
        rows.append(
            {
                "left": x_min,
                "top": y_min,
                "right": x_max,
                "bottom": y_max,
                "text": word_text,
            }
        )

    frame = pd.DataFrame(rows, columns=["left", "top", "right", "bottom", "text"])

    # Replace the far-corner coordinates with the box extent.
    frame["width"] = frame["right"] - frame["left"]
    frame["height"] = frame["bottom"] - frame["top"]
    return frame.drop(columns=["right", "bottom"])
142+
@staticmethod
def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
    """Extracts a word from an hOCR word tag, filtering out characters with low confidence.

    Args:
        word: hOCR word element whose ``ocrx_cinfo`` child spans hold the individual
            characters, each with an ``x_conf`` entry in its ``title`` attribute.
        character_confidence_threshold: minimum confidence, on a 0-1 scale, a character
            needs in order to be kept.

    Returns:
        The kept characters joined in document order, or an empty string when the word has
        no character spans or no character passes the filter.
    """
    character_spans = word.find_all("span", class_="ocrx_cinfo")
    if not character_spans:
        return ""

    kept_chars = []
    for character_span in character_spans:
        char = character_span.text

        char_title = character_span.get("title", "")
        # The fractional part is optional so that a confidence written as a whole number
        # (e.g. "x_conf 100") is parsed instead of the character being silently dropped.
        conf_match = re.search(r"x_conf (\d+(?:\.\d+)?)", char_title)

        if not (char and conf_match):
            continue

        # x_conf is a percentage; the threshold is expressed as a 0-1 probability.
        character_probability = float(conf_match.group(1)) / 100

        if character_probability >= character_confidence_threshold:
            kept_chars.append(char)

    return "".join(kept_chars)
167+
90168 @requires_dependencies ("unstructured_inference" )
91169 def get_layout_elements_from_image (self , image : PILImage .Image ) -> List ["LayoutElement" ]:
92170 from unstructured .partition .pdf_image .inference_utils import (
0 commit comments