@@ -4,6 +4,7 @@
 from langchain_core.documents import Document
 import aiohttp
 import async_timeout
+from typing import Union
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy

 logger = get_logger("web-loader")
@@ -111,14 +112,144 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:

         return results

+    async def ascrape_playwright_scroll(
+        self,
+        url: str,
+        timeout: Union[int, None] = 30,
+        scroll: int = 15000,
+        sleep: float = 2,
+        scroll_to_bottom: bool = False,
+    ) -> str:
123+ """
124+ Asynchronously scrape the content of a given URL using Playwright's sync API and scrolling.
125+
126+ Notes:
127+ - The user gets to decide between scrolling to the bottom of the page or scrolling by a finite amount of time.
128+ - If the user chooses to scroll to the bottom, the scraper will stop when the page height stops changing or when
129+ the timeout is reached. In this case, the user should opt for an appropriate timeout value i.e. larger than usual.
130+ - Sleep needs to be set to a value greater than 0 to allow lazy-loaded content to load.
131+ Additionally, if used with scroll_to_bottom=True, the sleep value should be set to a higher value, to
132+ make sure that the scrolling actually happens, thereby allowing the page height to change.
133+ - Probably the best website to test this is https://www.reddit.com/ as it has infinite scrolling.
134+
135+ Args:
136+ - url (str): The URL to scrape.
137+ - timeout (Union[int, None]): The maximum time to spend scrolling. This is separate from the global timeout. If set, must be greater than 0.
138+ Can also be set to None, in which case the scraper will only stop when the page height stops changing.
139+ - scroll (float): The number of pixels to scroll down by. Defaults to 15000. Cannot be less than 5000 pixels.
140+ Less than this and we don't scroll enough to see any content change.
141+ - sleep (int): The number of seconds to sleep after each scroll, to allow the page to load.
142+ Defaults to 2. Must be greater than 0.
143+
144+ Returns:
145+ str: The scraped HTML content
146+
147+ Raises:
148+ - ValueError: If the timeout value is less than or equal to 0.
149+ - ValueError: If the sleep value is less than or equal to 0.
150+ - ValueError: If the scroll value is less than 5000.
151+ """
+        # NB: I have tested using scrollHeight to determine when to stop scrolling,
+        # but it doesn't always work as expected. The page height doesn't change on some
+        # sites like https://www.steelwood.amsterdam/: the site does not scroll to the bottom.
+        # In my browser I can scroll vertically, but in Chromium it scrolls horizontally?!?
+
+        if timeout and timeout <= 0:
+            raise ValueError("If set, timeout value for scrolling scraper must be greater than 0.")
+
+        if sleep <= 0:
+            raise ValueError("Sleep value for scrolling scraper must be greater than 0.")
+
+        if scroll < 5000:
+            raise ValueError("Scroll value for scrolling scraper must be greater than or equal to 5000.")
+
+        from playwright.async_api import async_playwright
+        from undetected_playwright import Malenia
+        import time
+
+        logger.info(f"Starting scraping with scrolling support for {url}...")
+
+        results = ""
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_playwright() as p:
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+
+                    previous_height = None
+                    start_time = time.time()
+
+                    # Store the heights of the page after each scroll.
+                    # This is useful in case we scroll with a timer and want to stop shortly after reaching the bottom,
+                    # or simply when the page stops changing for some reason.
+                    heights = []
+
+                    while True:
+                        current_height = await page.evaluate("document.body.scrollHeight")
+                        heights.append(current_height)
+                        heights = heights[-5:]  # Keep only the last 5 heights so the list doesn't grow unboundedly
+
+                        # Break if we've reached the bottom of the page, i.e. if scrolling makes no more progress.
+                        # Attention!!! This is not always reliable. Sometimes the page might not change due to lazy loading
+                        # or other reasons. In such cases, the user should set scroll_to_bottom=False and set a timeout.
+                        if scroll_to_bottom and previous_height == current_height:
+                            logger.info(f"Reached bottom of page for url {url}")
+                            break
+
+                        previous_height = current_height
+
+                        await page.mouse.wheel(0, scroll)
+                        logger.debug(f"Scrolled {url} to current height {current_height}px...")
+                        await asyncio.sleep(sleep)  # Non-blocking sleep so lazy-loaded content can load
+
+                        current_time = time.time()
+                        elapsed_time = current_time - start_time
+                        logger.debug(f"Elapsed time: {elapsed_time} seconds")
+
+                        if timeout and elapsed_time >= timeout:
+                            logger.info(f"Reached timeout of {timeout} seconds for url {url}")
+                            break
+
+                        # Check height stagnation outside the timeout branch, so that a
+                        # timeout=None run still stops once the page height stops changing.
+                        if len(heights) == 5 and len(set(heights)) == 1:
+                            logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
+                            break
+
+                    results = await page.content()
+                    break
+
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                if "browser" in locals():
+                    await browser.close()
+
+        return results
+
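For reference, here is a minimal usage sketch of the new scrolling scraper (an editor's illustration, not part of this commit). The import path, the ChromiumLoader class name, and its constructor arguments are assumptions about the surrounding repo; only ascrape_playwright_scroll and its parameters come from the diff above.

    # Hedged sketch: assumes the method above lives on scrapegraphai's
    # ChromiumLoader and that the loader takes a list of URLs plus a backend
    # name; adjust the import to the actual repo layout.
    import asyncio
    from scrapegraphai.docloaders.chromium import ChromiumLoader  # assumed path

    async def main() -> None:
        loader = ChromiumLoader(["https://www.reddit.com/"], backend="playwright")
        # Scroll in 15000 px steps, pausing 2 s per step, for at most 60 s.
        html = await loader.ascrape_playwright_scroll(
            "https://www.reddit.com/", timeout=60, scroll=15000, sleep=2
        )
        print(html[:200])

    asyncio.run(main())

As a design note, the manual heights = heights[-5:] trim could equally be a collections.deque(maxlen=5), which bounds memory without re-slicing on every iteration.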
     async def ascrape_playwright(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content or an error message if an exception occurs.
         """
         from playwright.async_api import async_playwright
         from undetected_playwright import Malenia

         logger.info(f"Starting scraping with {self.backend}...")
+        results = ""
         attempt = 0

         while attempt < self.RETRY_LIMIT:
@@ -136,19 +267,21 @@ async def ascrape_playwright(self, url: str) -> str:
                     await page.wait_for_load_state(self.load_state)
                     results = await page.content()
                     logger.info("Content scraped")
-                    return results
+                    break
             except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    raise RuntimeError(
-                        f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}"
-                    )
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
             finally:
                 if "browser" in locals():
                     await browser.close()


+        return results
+
+
+
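Note the behavioural change in this hunk: once RETRY_LIMIT is exhausted, ascrape_playwright now returns an error string instead of raising RuntimeError. A minimal caller-side sketch of the new contract follows (the helper name is hypothetical):

    # Hypothetical helper: under the new contract a failed scrape yields a
    # string starting with "Error:" rather than an exception.
    def scrape_succeeded(results: str) -> bool:
        return not results.startswith("Error:")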
     async def ascrape_with_js_support(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.