
Commit c396dcf

Merge branch 'pre/beta' into feat_chromium_scroller
2 parents 7eeca1b + 60e2fdf

18 files changed: +614 -330 lines changed

CHANGELOG.md

Lines changed: 31 additions & 2 deletions
@@ -1,9 +1,38 @@
-## [1.32.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1...v1.32.0) (2024-12-02)
+## [1.33.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0...v1.33.0-beta.1) (2024-12-05)
 
 
 ### Features
 
-* add API integration ([46373af](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/46373afe6d8c05ad26039e68190f13d82b20a349))
+* add api integration ([8aa9103](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8aa9103f02af92d9e1a780450daa7bb303afc150))
+* add API integration ([ba6e931](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ba6e931caf5f3d4a3b9c31ec4655fe7a9f0e214c))
+* add sdk integration ([209b445](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/209b4456fd668d9d124fd5586b32a4be677d4bf8))
+* revert search function ([faf0c01](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/faf0c0123b5e2e548cbd1917e9d1df22e1edb1c5))
+
+
+### Bug Fixes
+
+* error on fetching the code ([7285ab0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7285ab065bba9099ba2751c9d2f21ee13fed0d5f))
+* improved links extraction for parse_node, resolves [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822) ([7da7bfe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7da7bfe338a6ce53c83361a1f6cd9ea2d5bd797f))
+
+
+### chore
+
+* migrate from rye to uv ([5fe528a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5fe528a7e7a3e230d8f68fd83ce5ad6ede5adfef))
+
+
+### CI
+
+* **release:** 1.32.0-beta.1 [skip ci] ([b98dd39](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b98dd39150947fb121cd726d343c9d6fb9a31d5f))
+* **release:** 1.32.0-beta.2 [skip ci] ([8b17764](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8b17764a53c4e16c7c0178925f9275282f5dba3c))
+* **release:** 1.32.0-beta.3 [skip ci] ([0769fce](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0769fce7d501692bd1135d6337b0aea4a397c8f1)), closes [#822](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/822)
+* **release:** 1.32.0-beta.4 [skip ci] ([67c9859](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/67c9859c2078e7ec3b3ac99827deb346860f1a83))
+* **release:** 1.32.0-beta.5 [skip ci] ([fbb4252](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fbb42526320cd614684fe1092cac89cde86c27d4))
+
+## [1.32.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0-beta.4...v1.32.0-beta.5) (2024-12-02)
+
+## [1.32.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1...v1.32.0) (2024-12-02)
+
 
 ## [1.32.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0-beta.3...v1.32.0-beta.4) (2024-12-02)
 

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
"""
Example leveraging a state file containing session cookies which
can be used to authenticate to a website and scrape protected
content.
"""

import os
import random

from dotenv import load_dotenv

# import playwright so we can use it to create the state file
from playwright.async_api import async_playwright

from scrapegraphai.graphs import OmniScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Leveraging Playwright external to the invocation of the graph to
# login and create the state file
# ************************************************


# note this is just an example and probably won't actually work on
# LinkedIn; the implementation of the login is highly dependent on the website
async def do_login():
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(
            timeout=30000,
            headless=False,
            slow_mo=random.uniform(500, 1500),
        )
        page = await browser.new_page()

        # very basic implementation of a login, in reality it may be trickier
        await page.goto("https://www.linkedin.com/login")
        await page.get_by_label("Email or phone").fill("some_bloke@some_domain.com")
        await page.get_by_label("Password").fill("test1234")
        await page.get_by_role("button", name="Sign in").click()
        await page.wait_for_timeout(3000)

        # assuming a successful login, we save the cookies to a file
        await page.context.storage_state(path="./state.json")


async def main():
    await do_login()

    # ************************************************
    # Define the configuration for the graph
    # ************************************************

    openai_api_key = os.getenv("OPENAI_APIKEY")

    graph_config = {
        "llm": {
            "api_key": openai_api_key,
            "model": "openai/gpt-4o",
        },
        "max_images": 10,
        "headless": False,
        # provide the path to the state file
        "storage_state": "./state.json",
    }

    # ************************************************
    # Create the OmniScraperGraph instance and run it
    # ************************************************

    omni_scraper_graph = OmniScraperGraph(
        prompt="List me all the projects with their description.",
        source="https://www.linkedin.com/feed/",
        config=graph_config,
    )

    # the storage_state is used to load the cookies from the state file
    # so we are authenticated and able to scrape protected content
    result = omni_scraper_graph.run()
    print(result)

    # ************************************************
    # Get graph execution info
    # ************************************************

    graph_exec_info = omni_scraper_graph.get_execution_info()
    print(prettify_exec_info(graph_exec_info))


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
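
For reference, the storage_state option this example relies on is also exposed directly by the lower-level ChromiumLoader extended in this commit (see the scrapegraphai/docloaders/chromium.py diff below), so a saved state file can be reused without building a full graph. A minimal sketch, assuming Playwright is installed, a valid ./state.json produced by a login step like do_login() above, and the constructor keywords shown in that diff; the URL is illustrative only:

import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader


async def load_protected_pages():
    # reuse the cookies saved by do_login() so the scrape is authenticated
    loader = ChromiumLoader(
        urls=["https://www.linkedin.com/feed/"],
        headless=True,
        storage_state="./state.json",  # path to the Playwright state file
    )
    # alazy_load yields one Document per URL, with the source URL as metadata
    async for doc in loader.alazy_load():
        print(doc.metadata, doc.page_content[:200])


# asyncio.run(load_protected_pages())
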
File renamed without changes.

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,8 @@ name = "scrapegraphai"
-version = "1.32.0"
+version = "1.33.0b1"
+

scrapegraphai/docloaders/chromium.py

Lines changed: 23 additions & 7 deletions
@@ -9,6 +9,7 @@
 
 logger = get_logger("web-loader")
 
+
 class ChromiumLoader(BaseLoader):
     """Scrapes HTML pages from URLs using a (headless) instance of the
     Chromium web driver with proxy protection.
@@ -34,6 +35,7 @@ def __init__(
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
         requires_js_support: bool = False,
+        storage_state: Optional[str] = None,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -63,6 +65,7 @@ def __init__(
         self.urls = urls
         self.load_state = load_state
         self.requires_js_support = requires_js_support
+        self.storage_state = storage_state
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -92,7 +95,9 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    results = (
+                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    )
             finally:
                 driver.quit()
 
@@ -244,7 +249,9 @@ async def ascrape_playwright(self, url: str) -> str:
             browser = await p.chromium.launch(
                 headless=self.headless, proxy=self.proxy, **self.browser_config
             )
-            context = await browser.new_context()
+            context = await browser.new_context(
+                storage_state=self.storage_state
+            )
             await Malenia.apply_stealth(context)
             page = await context.new_page()
             await page.goto(url, wait_until="domcontentloaded")
@@ -262,6 +269,7 @@ async def ascrape_playwright(self, url: str) -> str:
 
         return results
 
+
     async def ascrape_with_js_support(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.
@@ -270,7 +278,7 @@ async def ascrape_with_js_support(self, url: str) -> str:
             url (str): The URL to scrape.
 
         Returns:
-            str: The fully rendered HTML content after JavaScript execution, 
+            str: The fully rendered HTML content after JavaScript execution,
             or an error message if an exception occurs.
         """
         from playwright.async_api import async_playwright
@@ -285,7 +293,9 @@ async def ascrape_with_js_support(self, url: str) -> str:
             browser = await p.chromium.launch(
                 headless=self.headless, proxy=self.proxy, **self.browser_config
             )
-            context = await browser.new_context()
+            context = await browser.new_context(
+                storage_state=self.storage_state
+            )
             page = await context.new_page()
             await page.goto(url, wait_until="networkidle")
             results = await page.content()
@@ -295,7 +305,9 @@ async def ascrape_with_js_support(self, url: str) -> str:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    results = (
+                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    )
             finally:
                 await browser.close()
 
@@ -312,7 +324,9 @@ def lazy_load(self) -> Iterator[Document]:
             Document: The scraped content encapsulated within a Document object.
         """
         scraping_fn = (
-            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+            self.ascrape_with_js_support
+            if self.requires_js_support
+            else getattr(self, f"ascrape_{self.backend}")
         )
 
         for url in self.urls:
@@ -334,7 +348,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             source URL as metadata.
         """
         scraping_fn = (
-            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+            self.ascrape_with_js_support
+            if self.requires_js_support
+            else getattr(self, f"ascrape_{self.backend}")
         )
 
         tasks = [scraping_fn(url) for url in self.urls]
0 commit comments
