Skip to content

Commit 21e89d7

Browse files
update
1 parent 509adca commit 21e89d7

File tree

1 file changed

+29
-59
lines changed

1 file changed

+29
-59
lines changed

src/bisheng_unstructured/models/idp/dummy_ocr_agent.py

Lines changed: 29 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,24 @@ def process_paragraph(bboxes, texts, rect_box):
106106

107107
return up_table, down_table
108108

109+
def process_whole_paragraph(general_ocr_res):
    """Arrange OCR texts and boxes into a row/column grid.

    Args:
        general_ocr_res: Mapping with three parallel sequences:
            ``"bboxes"``, ``"texts"`` and ``"row_col_info"``. Each
            ``row_col_info`` entry is a ``(row, col)`` pair of
            zero-based grid indices for the matching text/box.
            NOTE(review): the box format is passed through untouched;
            confirm its shape against the OCR service response.

    Returns:
        A ``(layout_text, layout_boxs)`` tuple of 2-D lists indexed as
        ``[row][col]``. Cells with no OCR entry keep the placeholders
        ``""`` (text) and ``[]`` (box). When ``row_col_info`` is empty,
        both grids are empty lists.
    """
    boxes = general_ocr_res["bboxes"]
    texts = general_ocr_res["texts"]
    rowcol = general_ocr_res["row_col_info"]

    # Nothing was recognised: max() on an empty sequence would raise
    # ValueError, so return empty grids instead.
    if not rowcol:
        return [], []

    # Grid dimensions are the largest row/col index plus one.
    max_row = max(row[0] for row in rowcol) + 1
    max_col = max(row[1] for row in rowcol) + 1

    layout_text = [["" for _ in range(max_col)] for _ in range(max_row)]
    layout_boxs = [[[] for _ in range(max_col)] for _ in range(max_row)]

    # Fill the layout; a later entry at the same (row, col) overwrites an
    # earlier one, and zip stops at the shortest of the three sequences.
    for (row, col), txt, box in zip(rowcol, texts, boxes):
        layout_text[row][col] = txt
        layout_boxs[row][col] = box
    return layout_text, layout_boxs
125+
126+
109127
class OCRAgent(object):
110128
def __init__(self, **kwargs):
111129
self.ep = kwargs.get("ocr_model_ep")
@@ -122,67 +140,19 @@ def predict(self, inp) -> List[BlockInfo]:
122140
req_data = {"param": params, "data": [b64_image]}
123141
try:
124142
r = self.client.post(url=self.ep, json=req_data, timeout=self.timeout)
125-
# ret = convert_json(r.json())
126-
# return ret
143+
127144
except requests.exceptions.Timeout:
128145
raise Exception(f"timeout in formula agent predict")
129146
except Exception as e:
130147
raise Exception(f"exception in formula agent predict: [{e}]")
131148

132-
table_rect_box = []
133-
table_md_str = ""
134-
if 'table_result' in r["data"]["json"] and len(r["data"]["json"]["table_result"][0]["cell_infos"]):
135-
table_result = r["data"]["json"]["table_result"][0]["cell_infos"]
136-
table_md_str,table_rect_box = process_table(table_result)
137-
138-
bboxes = r["data"]["json"]["general_ocr_res"]["bboxes"]
139-
texts = r["data"]["json"]["general_ocr_res"]["texts"]
140-
141-
up_table, down_table = process_paragraph(bboxes, texts, table_rect_box)
142-
res = list()
143-
if len(table_rect_box):
144-
b1 = BlockInfo(
145-
block=[],
146-
block_text=table_md_str,
147-
block_no=0,
148-
ts=[""],
149-
rs=[table_rect_box],
150-
layout_type=1,
151-
)
152-
if len(up_table["boxs"]):
153-
text = "".join(up_table['texts'])
154-
box = recalculate_xy(up_table['boxs'])
155-
res.append(BlockInfo(
156-
block=[],
157-
block_text=text,
158-
block_no=0,
159-
ts=[text],
160-
rs=[box],
161-
layout_type=0,
162-
))
163-
res.append(b1)
164-
if len(down_table["boxs"]):
165-
text = "".join(down_table['texts'])
166-
box = recalculate_xy(down_table['boxs'])
167-
res.append(BlockInfo(
168-
block=[],
169-
block_text=text,
170-
block_no=0,
171-
ts=[text],
172-
rs=[box],
173-
layout_type=0,
174-
))
175-
return res
176-
else:
177-
text = "".join(up_table['texts'])
178-
box = recalculate_xy(up_table['boxs'])
179-
b0 = BlockInfo(
180-
block=[],
181-
block_text="abcdef",
182-
block_no=0,
183-
ts=["abc", "def"],
184-
rs=[[0, 0, 100, 30], [0, 50, 100, 80]],
185-
layout_type=0,
186-
)
187-
return [b0]
188-
149+
layout_text, layout_boxs = process_whole_paragraph(r["data"]["json"]["general_ocr_res"])
150+
b0 = BlockInfo(
151+
block=[],
152+
block_text=''.join([''.join(text) for text in layout_text]),
153+
block_no=0,
154+
ts=[''.join(text) for text in layout_text],
155+
rs=[''.join(text) for text in layout_boxs],
156+
layout_type=0,
157+
)
158+
return [b0]

0 commit comments

Comments
 (0)