
Commit 3ff69cb

Merge pull request #614 from ScrapeGraphAI/screenshot-scraper-fix
Screenshot scraper fix
2 parents 13efd4e + 1e466cd commit 3ff69cb

File tree

9 files changed (+298, -9 lines)

README.md

Lines changed: 7 additions & 0 deletions
@@ -54,6 +54,13 @@ This group includes additional browser management options, such as BrowserBase.
 pip install scrapegraphai[more-browser-options]
 ```
 
+### Installing "Screenshot Scraper"
+
+This group includes an OCR scraper for websites:
+```bash
+pip install scrapegraphai[screenshot_scraper]
+```
+
 ## 💻 Usage
 There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).

174 KB file (image preview omitted)
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
"""
example of scraping with screenshots
"""
import asyncio
from scrapegraphai.utils.screenshot_scraping import (take_screenshot,
                                                     select_area_with_opencv,
                                                     crop_image, detect_text)

# STEP 1: Take a screenshot
image = asyncio.run(take_screenshot(
    url="https://colab.google/",
    save_path="Savedscreenshots/test_image.jpeg",
    quality=50
))

# STEP 2 (Optional): Select an area of the image which you want to use for text detection.
LEFT, TOP, RIGHT, BOTTOM = select_area_with_opencv(image)
print("LEFT: ", LEFT, " TOP: ", TOP, " RIGHT: ", RIGHT, " BOTTOM: ", BOTTOM)

# STEP 3 (Optional): Crop the image.
# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None,
# it will be set to the corresponding edge of the image.
cropped_image = crop_image(image, LEFT=LEFT, RIGHT=RIGHT, TOP=TOP, BOTTOM=BOTTOM)

# STEP 4: Detect text
TEXT = detect_text(
    cropped_image,       # The image to detect text from
    languages=["en"]     # The languages to detect text in
)

print("DETECTED TEXT: ")
print(TEXT)
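
For headless or CI environments where neither OpenCV windows nor ipywidgets are available, the same pipeline can be run without the interactive selection step. The following is a minimal sketch built only on the take_screenshot and detect_text helpers introduced in this commit; the URL and quality value are illustrative.

```python
# Non-interactive variant: OCR the full-page screenshot without manual cropping.
import asyncio

from scrapegraphai.utils.screenshot_scraping import take_screenshot, detect_text

# No save_path, so the screenshot is kept in memory only (illustrative URL).
image = asyncio.run(take_screenshot(url="https://example.com", quality=80))

print(detect_text(image, languages=["en"]))
```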

pyproject.toml

Lines changed: 7 additions & 0 deletions
@@ -89,6 +89,13 @@ more-browser-options = [
     "browserbase>=0.3.0",
 ]
 
+# Group 4: Surya Library
+screenshot_scraper = [
+    "surya-ocr>=0.5.0; python_version >= '3.10'",
+    "matplotlib>=3.7.2; python_version >= '3.10'",
+    "ipywidgets>=8.1.0; python_version >= '3.10'"
+]
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
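
For reference, this new optional-dependency group corresponds to the install command added to the README above. A hedged sketch of the full setup (quoting the extra keeps shells such as zsh from expanding the brackets; the playwright step is the standard Playwright browser download, assumed here rather than introduced by this commit):

```bash
# Install the base package plus the screenshot/OCR extras (requires Python >= 3.10).
pip install "scrapegraphai[screenshot_scraper]"

# take_screenshot drives headless Chromium via Playwright, so the browser
# binaries must be downloaded once (standard Playwright setup, assumed here).
playwright install chromium
```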

requirements-dev.lock

Lines changed: 0 additions & 9 deletions
@@ -159,10 +159,6 @@ idna==3.7
     # via yarl
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==8.2.0
-    # via sphinx
-importlib-resources==6.4.0
-    # via matplotlib
 iniconfig==2.0.0
     # via pytest
 isort==5.13.2
@@ -447,10 +443,8 @@ typing-extensions==4.12.2
     # via pydantic
     # via pydantic-core
     # via pyee
-    # via pylint
     # via sf-hamilton
     # via sqlalchemy
-    # via starlette
     # via streamlit
     # via typing-inspect
     # via uvicorn
@@ -470,6 +464,3 @@ uvicorn==0.30.5
     # via burr
 yarl==1.9.4
     # via aiohttp
-zipp==3.20.0
-    # via importlib-metadata
-    # via importlib-resources

scrapegraphai/utils/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -11,4 +11,6 @@
 from .cleanup_html import cleanup_html
 from .logging import *
 from .convert_to_md import convert_to_md
+from .screenshot_scraping.screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image
+from .screenshot_scraping.text_detection import detect_text
 from .token_calculator import *
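
Since the helpers are now re-exported from scrapegraphai.utils, they can be imported either from the screenshot_scraping subpackage (as in the example script above) or directly from the utils package. A minimal sketch, assuming the screenshot_scraper extra is installed:

```python
# Both import paths resolve to the same functions after this change.
from scrapegraphai.utils import take_screenshot, crop_image, detect_text
from scrapegraphai.utils.screenshot_scraping import select_area_with_opencv
```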
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
from .screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image
from .text_detection import detect_text
Lines changed: 219 additions & 0 deletions
@@ -0,0 +1,219 @@
"""
screenshot_preparation module
"""
import asyncio
from io import BytesIO
from PIL import Image, ImageGrab
from playwright.async_api import async_playwright
import cv2 as cv
import numpy as np


async def take_screenshot(url: str, save_path: str = None, quality: int = 100):
    """
    Takes a screenshot of a webpage at the specified URL and saves it if save_path is specified.
    Parameters:
        url (str): The URL of the webpage to take a screenshot of.
        save_path (str): The path to save the screenshot to. Defaults to None.
        quality (int): The quality of the JPEG image, between 1 and 100. Defaults to 100.
    Returns:
        PIL.Image: The screenshot of the webpage as a PIL Image object.
    """

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        image_bytes = await page.screenshot(path=save_path,
                                            type="jpeg",
                                            full_page=True,
                                            quality=quality)
        await browser.close()
        return Image.open(BytesIO(image_bytes))


def select_area_with_opencv(image):
    """
    Allows you to manually select an image area using OpenCV.
    It is recommended to use this function if your project runs on your own computer;
    otherwise use select_area_with_ipywidget().
    Parameters:
        image (PIL.Image): The image from which to select an area.
    Returns:
        A tuple containing the LEFT, TOP, RIGHT, and BOTTOM coordinates of the selected area.
    """

    # Grab the current screen only to learn the display size, used below to scale the preview window.
    fullscreen_screenshot = ImageGrab.grab()
    dw, dh = fullscreen_screenshot.size

    def draw_selection_rectangle(event, x, y, flags, param):
        global ix, iy, drawing, overlay, img
        if event == cv.EVENT_LBUTTONDOWN:
            drawing = True
            ix, iy = x, y
        elif event == cv.EVENT_MOUSEMOVE:
            if drawing:
                cv.rectangle(img, (ix, iy), (x, y), (41, 215, 162), -1)
                cv.putText(img, 'PRESS ANY KEY TO SELECT THIS AREA',
                           (ix, iy - 10), cv.FONT_HERSHEY_SIMPLEX, 1.5, (55, 46, 252), 5)
                img = cv.addWeighted(overlay, alpha, img, 1 - alpha, 0)
        elif event == cv.EVENT_LBUTTONUP:
            global LEFT, TOP, RIGHT, BOTTOM

            drawing = False
            if ix < x:
                LEFT = int(ix)
                RIGHT = int(x)
            else:
                LEFT = int(x)
                RIGHT = int(ix)
            if iy < y:
                TOP = int(iy)
                BOTTOM = int(y)
            else:
                TOP = int(y)
                BOTTOM = int(iy)

    global drawing, ix, iy, overlay, img
    drawing = False
    ix, iy = -1, -1

    img = np.array(image)
    img = cv.cvtColor(img, cv.COLOR_RGB2BGR)

    img = cv.rectangle(
        img, (0, 0), (image.size[0], image.size[1]), (0, 0, 255), 10)
    img = cv.putText(img, 'SELECT AN AREA', (int(
        image.size[0] * 0.3), 100), cv.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 5)

    overlay = img.copy()
    alpha = 0.3

    # Show the image until the user draws a rectangle and presses any key.
    while True:
        cv.namedWindow('SELECT AREA', cv.WINDOW_KEEPRATIO)
        cv.setMouseCallback('SELECT AREA', draw_selection_rectangle)
        cv.resizeWindow('SELECT AREA', int(
            image.size[0] / (image.size[1] / dh)), dh)

        cv.imshow('SELECT AREA', img)

        if cv.waitKey(20) > -1:
            break

    cv.destroyAllWindows()
    return LEFT, TOP, RIGHT, BOTTOM


def select_area_with_ipywidget(image):
    """
    Allows you to manually select an image area using ipywidgets.
    It is recommended to use this function if your project runs in Google Colab,
    Kaggle or a similar platform; otherwise use select_area_with_opencv().
    Parameters:
        image (PIL.Image): The input image.
    Returns:
        The left_right and top_bottom range sliders used for the selection.
    """

    import matplotlib.pyplot as plt
    import numpy as np
    from ipywidgets import interact
    import ipywidgets as widgets

    img_array = np.array(image)

    print(img_array.shape)

    def update_plot(top_bottom, left_right, image_size):
        plt.figure(figsize=(image_size, image_size))
        plt.imshow(img_array)
        plt.axvline(x=left_right[0], color='blue', linewidth=1)
        plt.text(left_right[0] + 1, -25, 'LEFT', rotation=90, color='blue')
        plt.axvline(x=left_right[1], color='red', linewidth=1)
        plt.text(left_right[1] + 1, -25, 'RIGHT', rotation=90, color='red')

        plt.axhline(y=img_array.shape[0] -
                    top_bottom[0], color='green', linewidth=1)
        plt.text(-100, img_array.shape[0] -
                 top_bottom[0] + 1, 'BOTTOM', color='green')
        plt.axhline(y=img_array.shape[0] - top_bottom[1],
                    color='darkorange', linewidth=1)
        plt.text(-100, img_array.shape[0] -
                 top_bottom[1] + 1, 'TOP', color='darkorange')
        plt.axis('off')
        plt.show()

    top_bottom_slider = widgets.IntRangeSlider(
        value=[int(img_array.shape[0] * 0.25), int(img_array.shape[0] * 0.75)],
        min=0,
        max=img_array.shape[0],
        step=1,
        description='top_bottom:',
        disabled=False,
        continuous_update=True,
        orientation='vertical',
        readout=True,
        readout_format='d',
    )

    left_right_slider = widgets.IntRangeSlider(
        value=[int(img_array.shape[1] * 0.25), int(img_array.shape[1] * 0.75)],
        min=0,
        max=img_array.shape[1],
        step=1,
        description='left_right:',
        disabled=False,
        continuous_update=True,
        orientation='horizontal',
        readout=True,
        readout_format='d',
    )
    image_size_bt = widgets.BoundedIntText(
        value=10,
        min=2,
        max=20,
        step=1,
        description='Image size:',
        disabled=False
    )

    interact(update_plot, top_bottom=top_bottom_slider,
             left_right=left_right_slider, image_size=image_size_bt)

    return left_right_slider, top_bottom_slider


def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path: str = None):
    """
    Crop an image using the specified coordinates.
    Parameters:
        image (PIL.Image): The image to be cropped.
        LEFT (int, optional): The x-coordinate of the left edge of the crop area. Defaults to None.
        TOP (int, optional): The y-coordinate of the top edge of the crop area. Defaults to None.
        RIGHT (int, optional): The x-coordinate of the right edge of the crop area. Defaults to None.
        BOTTOM (int, optional): The y-coordinate of the bottom edge of the crop area. Defaults to None.
        save_path (str, optional): The path to save the cropped image. Defaults to None.
    Returns:
        PIL.Image: The cropped image.
    Notes:
        If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None,
        it will be set to the corresponding edge of the image.
        If save_path is specified, the cropped image will be saved as a JPEG file at the specified path.
    """

    if LEFT is None:
        LEFT = 0
    if TOP is None:
        TOP = 0
    if RIGHT is None:
        RIGHT = image.size[0]
    if BOTTOM is None:
        BOTTOM = image.size[1]

    cropped_image = image.crop((LEFT, TOP, RIGHT, BOTTOM))
    if save_path is not None:
        cropped_image.save(save_path, "JPEG")

    return cropped_image
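
To illustrate the cropping defaults documented in crop_image (any coordinate left as None falls back to the corresponding image edge), here is a small usage sketch; the URL, coordinates and file names are illustrative only:

```python
import asyncio

from scrapegraphai.utils.screenshot_scraping import take_screenshot, crop_image

# Full-page screenshot, also written to disk as a JPEG (illustrative URL/path).
image = asyncio.run(take_screenshot(
    url="https://example.com",
    save_path="full_page.jpeg",
    quality=90,
))

# Only BOTTOM is given: LEFT/TOP default to 0 and RIGHT to the image width,
# so this keeps the top 600 pixels across the full width of the page.
header = crop_image(image, BOTTOM=600, save_path="header.jpeg")
print(header.size)
```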
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
"""
text_detection module
"""
from surya.ocr import run_ocr
from surya.model.detection.model import (load_model as load_det_model,
                                         load_processor as load_det_processor)
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor


def detect_text(image, languages: list = ["en"]):
    """
    Detects and extracts text from a given image.
    Parameters:
        image (PIL.Image): The input image to extract text from.
        languages (list): A list of languages to detect text in. Defaults to ["en"].
            The list of supported languages can be found here:
            https://github.com/VikParuchuri/surya/blob/master/surya/languages.py
    Returns:
        str: The extracted text from the image.
    Notes:
        Model weights will automatically download the first time you run this function.
    """

    langs = languages
    det_processor, det_model = load_det_processor(), load_det_model()
    rec_model, rec_processor = load_rec_model(), load_rec_processor()
    predictions = run_ocr([image], [langs], det_model,
                          det_processor, rec_model, rec_processor)
    text = "\n".join([line.text for line in predictions[0].text_lines])
    return text
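
detect_text only expects a PIL image, so it can also run on images that were not produced by take_screenshot. A minimal sketch with an illustrative local file name (language codes follow the Surya list linked in the docstring):

```python
from PIL import Image

from scrapegraphai.utils.screenshot_scraping import detect_text

# Illustrative file name; any image PIL can open will work.
image = Image.open("scanned_page.jpeg")

# Surya downloads its detection/recognition weights on first use.
text = detect_text(image, languages=["en", "it"])
print(text)
```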
