Skip to content

Commit 169dd20

Browse files
authored
Merge branch 'master' into feat/deploy
2 parents 54b8b47 + d1b39a2 commit 169dd20

File tree

9 files changed

+149
-149
lines changed

9 files changed

+149
-149
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@ __pycache__
77
.env
88
.env.*
99
venv/
10-
*.pem
10+
*.pem

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
**OmniParser** is a comprehensive method for parsing user interface screenshots into structured and easy-to-understand elements, which significantly enhances the ability of GPT-4V to generate actions that can be accurately grounded in the corresponding regions of the interface.
1313

1414
## News
15+
- [2024/10] Feel free to check out our demo on [huggingface space](https://huggingface.co/spaces/microsoft/OmniParser)! (stay tuned for OmniParser + Claude Computer Use)
1516
- [2024/10] Both Interactive Region Detection Model and Icon functional description model are released! [Huggingface models](https://huggingface.co/microsoft/OmniParser)
1617
- [2024/09] OmniParser achieves the best performance on [Windows Agent Arena](https://microsoft.github.io/WindowsAgentArena/)!
1718

__pycache__/utils.cpython-312.pyc

629 Bytes
Binary file not shown.

__pycache__/utils.cpython-39.pyc

-19.2 KB
Binary file not shown.

demo.ipynb

Lines changed: 114 additions & 135 deletions
Large diffs are not rendered by default.

gradio_demo.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,14 @@ def default(self, obj):
6363
def process(
6464
image_input,
6565
box_threshold,
66-
iou_threshold
67-
) -> Tuple[Optional[Image.Image], Text]:
66+
iou_threshold,
67+
use_paddleocr
68+
) -> Optional[Image.Image]:
6869

6970
image_save_path = 'imgs/saved_image_demo.png'
7071
image_input.save(image_save_path)
7172

72-
ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_save_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9})
73+
ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_save_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9}, use_paddleocr=use_paddleocr)
7374
text, ocr_bbox = ocr_bbox_rslt
7475

7576
dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_save_path, yolo_model, BOX_TRESHOLD = box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,iou_threshold=iou_threshold)
@@ -103,6 +104,8 @@ def process(
103104
# set the threshold for removing the bounding boxes with large overlap, default is 0.1
104105
iou_threshold_component = gr.Slider(
105106
label='IOU Threshold', minimum=0.01, maximum=1.0, step=0.01, value=0.1)
107+
use_paddleocr_component = gr.Checkbox(
108+
label='Use PaddleOCR', value=True)
106109
submit_button_component = gr.Button(
107110
value='Submit', variant='primary')
108111
with gr.Column():
@@ -114,7 +117,8 @@ def process(
114117
inputs=[
115118
image_input_component,
116119
box_threshold_component,
117-
iou_threshold_component
120+
iou_threshold_component,
121+
use_paddleocr_component
118122
],
119123
outputs=[image_output_component, text_output_component]
120124
)

imgs/saved_image_demo.png

-122 KB
Loading

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,5 @@ dill
1414
accelerate
1515
timm
1616
einops==0.8.0
17+
paddlepaddle
18+
paddleocr

utils.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,17 @@
1818
# %matplotlib inline
1919
from matplotlib import pyplot as plt
2020
import easyocr
21+
from paddleocr import PaddleOCR
2122
reader = easyocr.Reader(['en'])
23+
paddle_ocr = PaddleOCR(
24+
lang='en', # other lang also available
25+
use_angle_cls=False,
26+
use_gpu=False, # using cuda will conflict with pytorch in the same process
27+
show_log=False,
28+
max_batch_size=1024,
29+
use_dilation=True, # improves accuracy
30+
det_db_score_mode='slow', # improves accuracy
31+
rec_batch_num=1024)
2232
import time
2333
import base64
2434

@@ -370,14 +380,18 @@ def get_xywh_yolo(input):
370380

371381

372382

373-
def check_ocr_box(image_path, display_img = True, output_bb_format='xywh', goal_filtering=None, easyocr_args=None):
374-
if easyocr_args is None:
375-
easyocr_args = {}
376-
result = reader.readtext(image_path, **easyocr_args)
377-
is_goal_filtered = False
378-
# print('goal filtering pred:', result[-5:])
379-
coord = [item[0] for item in result]
380-
text = [item[1] for item in result]
383+
def check_ocr_box(image_path, display_img = True, output_bb_format='xywh', goal_filtering=None, easyocr_args=None, use_paddleocr=False):
384+
if use_paddleocr:
385+
result = paddle_ocr.ocr(image_path, cls=False)[0]
386+
coord = [item[0] for item in result]
387+
text = [item[1][0] for item in result]
388+
else: # EasyOCR
389+
if easyocr_args is None:
390+
easyocr_args = {}
391+
result = reader.readtext(image_path, **easyocr_args)
392+
# print('goal filtering pred:', result[-5:])
393+
coord = [item[0] for item in result]
394+
text = [item[1] for item in result]
381395
# read the image using cv2
382396
if display_img:
383397
opencv_img = cv2.imread(image_path)
@@ -397,7 +411,7 @@ def check_ocr_box(image_path, display_img = True, output_bb_format='xywh', goal_
397411
elif output_bb_format == 'xyxy':
398412
bb = [get_xyxy(item) for item in coord]
399413
# print('bounding box!!!', bb)
400-
return (text, bb), is_goal_filtered
414+
return (text, bb), goal_filtering
401415

402416

403417

0 commit comments

Comments
 (0)