@@ -55,60 +55,71 @@ def initialize(
5555 )
5656 self .model .to (device )
5757
def get_tokens(self, x: Image):
    """Get OCR tokens from either paddleocr or tesseract.

    On ``x86_64`` machines PaddleOCR is tried first; if its module cannot be
    imported, a warning is logged and tesseract is used instead. On every
    other architecture tesseract is used directly.

    Args:
        x: Image to run OCR on. It is converted with ``np.array(x)`` before
            being handed to either OCR engine.

    Returns:
        A list of dicts, one per recognized piece of text. Each dict has a
        ``"bbox"`` entry ``[xmin, ymin, xmax, ymax]`` and a ``"text"`` entry
        holding the recognized text.
    """
    if platform.machine() == "x86_64":
        try:
            # Imported lazily so that paddle stays an optional dependency.
            from unstructured_inference.models import paddle_ocr

            paddle_result = paddle_ocr.load_agent().ocr(np.array(x), cls=True)
        except ModuleNotFoundError:
            logging.warning(
                "No module named 'unstructured_paddleocr', falling back to tesseract",
            )
        else:
            tokens = []
            # NOTE: PaddleOCR can report ``None`` (or an empty list) for a page
            # in which no text was detected; treat that as "no tokens" rather
            # than failing while iterating over it.
            for page in paddle_result or []:
                for line in page or []:
                    # line[0] holds the corner points of the detected text
                    # region; collapse them into an axis-aligned bounding box.
                    corners = line[0]
                    xs = [point[0] for point in corners]
                    ys = [point[1] for point in corners]
                    tokens.append(
                        {
                            "bbox": [min(xs), min(ys), max(xs), max(ys)],
                            "text": line[1][0],
                        },
                    )
            return tokens

    # Tesseract path: upscale the image before OCR, then divide the reported
    # coordinates by the same factor so boxes refer to the original image.
    zoom = 6
    img = cv2.resize(
        cv2.cvtColor(np.array(x), cv2.COLOR_RGB2BGR),
        None,
        fx=zoom,
        fy=zoom,
        interpolation=cv2.INTER_CUBIC,
    )

    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)

    ocr_df: pd.DataFrame = pytesseract.image_to_data(
        Image.fromarray(img),
        output_type="data.frame",
    )

    # Drop rows with missing values (e.g. entries that carry no text).
    ocr_df = ocr_df.dropna()

    return [
        {
            "bbox": [
                row.left / zoom,
                row.top / zoom,
                (row.left + row.width) / zoom,
                (row.top + row.height) / zoom,
            ],
            "text": row.text,
        }
        for row in ocr_df.itertuples()
    ]
115+
58116 def run_prediction (self , x : Image ):
59117 """Predict table structure"""
60118 with torch .no_grad ():
61119 encoding = self .feature_extractor (x , return_tensors = "pt" ).to (self .device )
62120 outputs_structure = self .model (** encoding )
63121
64- if platform .machine () == "x86_64" :
65- from unstructured_inference .models import paddle_ocr
66-
67- paddle_result = paddle_ocr .load_agent ().ocr (np .array (x ), cls = True )
68-
69- tokens = []
70- for idx in range (len (paddle_result )):
71- res = paddle_result [idx ]
72- for line in res :
73- xmin = min ([i [0 ] for i in line [0 ]])
74- ymin = min ([i [1 ] for i in line [0 ]])
75- xmax = max ([i [0 ] for i in line [0 ]])
76- ymax = max ([i [1 ] for i in line [0 ]])
77- tokens .append ({"bbox" : [xmin , ymin , xmax , ymax ], "text" : line [1 ][0 ]})
78- else :
79- zoom = 6
80- img = cv2 .resize (
81- cv2 .cvtColor (np .array (x ), cv2 .COLOR_RGB2BGR ),
82- None ,
83- fx = zoom ,
84- fy = zoom ,
85- interpolation = cv2 .INTER_CUBIC ,
86- )
87-
88- kernel = np .ones ((1 , 1 ), np .uint8 )
89- img = cv2 .dilate (img , kernel , iterations = 1 )
90- img = cv2 .erode (img , kernel , iterations = 1 )
91-
92- ocr_df : pd .DataFrame = pytesseract .image_to_data (
93- Image .fromarray (img ),
94- output_type = "data.frame" ,
95- )
96-
97- ocr_df = ocr_df .dropna ()
98-
99- tokens = []
100- for idtx in ocr_df .itertuples ():
101- tokens .append (
102- {
103- "bbox" : [
104- idtx .left / zoom ,
105- idtx .top / zoom ,
106- (idtx .left + idtx .width ) / zoom ,
107- (idtx .top + idtx .height ) / zoom ,
108- ],
109- "text" : idtx .text ,
110- },
111- )
122+ tokens = self .get_tokens (x = x )
112123
113124 sorted (tokens , key = lambda x : x ["bbox" ][1 ] * 10000 + x ["bbox" ][0 ])
114125
0 commit comments