
Commit c396dcf

Merge branch 'pre/beta' into feat_chromium_scroller
2 parents 7eeca1b + 60e2fdf

18 files changed: +614 -330 lines changed

CHANGELOG.md

Lines changed: 31 additions & 2 deletions
@@ -1,9 +1,38 @@
-## [1.32.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1...v1.32.0) (2024-12-02)
+## [1.33.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0...v1.33.0-beta.1) (2024-12-05)
 
 
 ### Features
 
-* add API integration ([46373af](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/46373afe6d8c05ad26039e68190f13d82b20a349))
+* add api integration ([8aa9103](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8aa9103f02af92d9e1a780450daa7bb303afc150))
+* add API integration ([ba6e931](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ba6e931caf5f3d4a3b9c31ec4655fe7a9f0e214c))
+* add sdk integration ([209b445](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/209b4456fd668d9d124fd5586b32a4be677d4bf8))
+* revert search function ([faf0c01](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/faf0c0123b5e2e548cbd1917e9d1df22e1edb1c5))
+
+
+### Bug Fixes
+
+* error on fetching the code ([7285ab0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7285ab065bba9099ba2751c9d2f21ee13fed0d5f))
+* improved links extraction for parse_node, resolves [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822) ([7da7bfe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7da7bfe338a6ce53c83361a1f6cd9ea2d5bd797f))
+
+
+### chore
+
+* migrate from rye to uv ([5fe528a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5fe528a7e7a3e230d8f68fd83ce5ad6ede5adfef))
+
+
+### CI
+
+* **release:** 1.32.0-beta.1 [skip ci] ([b98dd39](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b98dd39150947fb121cd726d343c9d6fb9a31d5f))
+* **release:** 1.32.0-beta.2 [skip ci] ([8b17764](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8b17764a53c4e16c7c0178925f9275282f5dba3c))
+* **release:** 1.32.0-beta.3 [skip ci] ([0769fce](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0769fce7d501692bd1135d6337b0aea4a397c8f1)), closes [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822)
+* **release:** 1.32.0-beta.4 [skip ci] ([67c9859](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/67c9859c2078e7ec3b3ac99827deb346860f1a83))
+* **release:** 1.32.0-beta.5 [skip ci] ([fbb4252](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fbb42526320cd614684fe1092cac89cde86c27d4))
+
+## [1.32.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0-beta.4...v1.32.0-beta.5) (2024-12-02)
+
+## [1.32.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1...v1.32.0) (2024-12-02)
+
 
 ## [1.32.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0-beta.3...v1.32.0-beta.4) (2024-12-02)
 

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
"""
Example leveraging a state file containing session cookies which
can be used to authenticate to a website and scrape protected
content.
"""

import os
import random

from dotenv import load_dotenv

# import playwright so we can use it to create the state file
from playwright.async_api import async_playwright

from scrapegraphai.graphs import OmniScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Leveraging Playwright external to the invocation of the graph to
# login and create the state file
# ************************************************


# note this is just an example and probably won't actually work on
# LinkedIn; the implementation of the login is highly dependent on the website
async def do_login():
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(
            timeout=30000,
            headless=False,
            slow_mo=random.uniform(500, 1500),
        )
        page = await browser.new_page()

        # very basic implementation of a login, in reality it may be trickier
        await page.goto("https://www.linkedin.com/login")
        await page.get_by_label("Email or phone").fill("some_bloke@some_domain.com")
        await page.get_by_label("Password").fill("test1234")
        await page.get_by_role("button", name="Sign in").click()
        await page.wait_for_timeout(3000)

        # assuming a successful login, we save the cookies to a file
        await page.context.storage_state(path="./state.json")


async def main():
    await do_login()

    # ************************************************
    # Define the configuration for the graph
    # ************************************************

    openai_api_key = os.getenv("OPENAI_APIKEY")

    graph_config = {
        "llm": {
            "api_key": openai_api_key,
            "model": "openai/gpt-4o",
        },
        "max_images": 10,
        "headless": False,
        # provide the path to the state file
        "storage_state": "./state.json",
    }

    # ************************************************
    # Create the OmniScraperGraph instance and run it
    # ************************************************

    omni_scraper_graph = OmniScraperGraph(
        prompt="List me all the projects with their description.",
        source="https://www.linkedin.com/feed/",
        config=graph_config,
    )

    # the storage_state is used to load the cookies from the state file
    # so we are authenticated and able to scrape protected content
    result = omni_scraper_graph.run()
    print(result)

    # ************************************************
    # Get graph execution info
    # ************************************************

    graph_exec_info = omni_scraper_graph.get_execution_info()
    print(prettify_exec_info(graph_exec_info))


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
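
For reference, the storage_state option this example relies on is also exposed directly by the lower-level ChromiumLoader extended in this commit (see the scrapegraphai/docloaders/chromium.py diff below), so a saved state file can be reused without building a full graph. A minimal sketch, assuming Playwright is installed, a valid ./state.json produced by a login step like do_login() above, and the constructor keywords shown in that diff; the URL is illustrative only:

import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader


async def load_protected_pages():
    # reuse the cookies saved by do_login() so the scrape is authenticated
    loader = ChromiumLoader(
        urls=["https://www.linkedin.com/feed/"],
        headless=True,
        storage_state="./state.json",  # path to the Playwright state file
    )
    # alazy_load yields one Document per URL, with the source URL as metadata
    async for doc in loader.alazy_load():
        print(doc.metadata, doc.page_content[:200])


# asyncio.run(load_protected_pages())
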
File renamed without changes.

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,8 @@ name = "scrapegraphai"
-version = "1.32.0"
+version = "1.33.0b1"
+

scrapegraphai/docloaders/chromium.py

Lines changed: 23 additions & 7 deletions
@@ -9,6 +9,7 @@
 
 logger = get_logger("web-loader")
 
+
 class ChromiumLoader(BaseLoader):
     """Scrapes HTML pages from URLs using a (headless) instance of the
     Chromium web driver with proxy protection.
@@ -34,6 +35,7 @@ def __init__(
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
         requires_js_support: bool = False,
+        storage_state: Optional[str] = None,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -63,6 +65,7 @@ def __init__(
         self.urls = urls
         self.load_state = load_state
         self.requires_js_support = requires_js_support
+        self.storage_state = storage_state
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -92,7 +95,9 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    results = (
+                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    )
             finally:
                 driver.quit()
 
@@ -244,7 +249,9 @@ async def ascrape_playwright(self, url: str) -> str:
             browser = await p.chromium.launch(
                 headless=self.headless, proxy=self.proxy, **self.browser_config
             )
-            context = await browser.new_context()
+            context = await browser.new_context(
+                storage_state=self.storage_state
+            )
             await Malenia.apply_stealth(context)
             page = await context.new_page()
             await page.goto(url, wait_until="domcontentloaded")
@@ -262,6 +269,7 @@ async def ascrape_playwright(self, url: str) -> str:
 
         return results
 
+
     async def ascrape_with_js_support(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.
@@ -270,7 +278,7 @@ async def ascrape_with_js_support(self, url: str) -> str:
             url (str): The URL to scrape.
 
         Returns:
-            str: The fully rendered HTML content after JavaScript execution, 
+            str: The fully rendered HTML content after JavaScript execution,
             or an error message if an exception occurs.
         """
         from playwright.async_api import async_playwright
@@ -285,7 +293,9 @@ async def ascrape_with_js_support(self, url: str) -> str:
             browser = await p.chromium.launch(
                 headless=self.headless, proxy=self.proxy, **self.browser_config
             )
-            context = await browser.new_context()
+            context = await browser.new_context(
+                storage_state=self.storage_state
+            )
             page = await context.new_page()
             await page.goto(url, wait_until="networkidle")
             results = await page.content()
@@ -295,7 +305,9 @@ async def ascrape_with_js_support(self, url: str) -> str:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    results = (
+                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    )
             finally:
                 await browser.close()
 
@@ -312,7 +324,9 @@ def lazy_load(self) -> Iterator[Document]:
             Document: The scraped content encapsulated within a Document object.
         """
         scraping_fn = (
-            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+            self.ascrape_with_js_support
+            if self.requires_js_support
+            else getattr(self, f"ascrape_{self.backend}")
         )
 
         for url in self.urls:
@@ -334,7 +348,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             source URL as metadata.
         """
         scraping_fn = (
-            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+            self.ascrape_with_js_support
+            if self.requires_js_support
+            else getattr(self, f"ascrape_{self.backend}")
         )
 
         tasks = [scraping_fn(url) for url in self.urls]
0 commit comments
