Skip to content

Commit 84548d0

Browse files
committed
Resolved merge conflict in scrapegraphai/helpers/models_tokens.py
2 parents a4f0f5d + f97c45c commit 84548d0

File tree

4 files changed

+149
-47
lines changed

4 files changed

+149
-47
lines changed

CHANGELOG.md

Lines changed: 1 addition & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,3 @@
1-
## [1.33.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.7...v1.33.8) (2024-12-16)
2-
3-
4-
### Bug Fixes
5-
6-
* pyproject ([76ac0a2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/76ac0a2141d9d53af023a405e2c61849921e4f0e))
7-
8-
## [1.33.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.6...v1.33.7) (2024-12-16)
9-
10-
11-
### Bug Fixes
12-
13-
* pyproject ([3dcfcd4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3dcfcd492e71297031a7df1dba9dd135f1fae60e))
14-
15-
## [1.33.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.5...v1.33.6) (2024-12-16)
16-
17-
18-
### Bug Fixes
19-
20-
* pyproject ([bf6cb0a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bf6cb0a582004617724e11ed04ba617eb39abc0c))
21-
22-
## [1.33.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.4...v1.33.5) (2024-12-16)
23-
24-
25-
### Bug Fixes
26-
27-
* uv.lock ([0a7fc39](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0a7fc392dea2b62122b977d62f4d85b117fc8351))
28-
29-
## [1.33.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.3...v1.33.4) (2024-12-16)
30-
31-
32-
### Bug Fixes
33-
34-
* context window ([ffdadae](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ffdadaed6fe3f17da535e6eddb73851fce2f4bf2))
35-
36-
## [1.33.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.2...v1.33.3) (2024-12-11)
37-
38-
39-
### Bug Fixes
40-
41-
* formatting ([d1b2104](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d1b2104f28d84c5129edb29a5efdaf5bf7d22bfb))
42-
431
## [1.33.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.33.1...v1.33.2) (2024-12-06)
442

453

@@ -57,6 +15,7 @@
5715
## [1.33.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0...v1.33.0) (2024-12-05)
5816

5917

18+
6019
### Features
6120

6221
* add api integration ([8aa9103](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8aa9103f02af92d9e1a780450daa7bb303afc150))

pyproject.toml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
[project]
22
name = "scrapegraphai"
3-
version = "1.33.8"
3+
4+
5+
6+
version = "1.33.2"
7+
8+
9+
10+
411
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
512
authors = [
613
{ name = "Marco Vinciguerra", email = "[email protected]" },

scrapegraphai/docloaders/chromium.py

Lines changed: 137 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from langchain_core.documents import Document
55
import aiohttp
66
import async_timeout
7+
from typing import Union
78
from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy
89

910
logger = get_logger("web-loader")
@@ -111,14 +112,144 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
111112

112113
return results
113114

115+
async def ascrape_playwright_scroll(
116+
self,
117+
url: str,
118+
timeout: Union[int, None]=30,
119+
scroll: int=15000,
120+
sleep: float=2,
121+
scroll_to_bottom: bool=False
122+
) -> str:
123+
"""
124+
Asynchronously scrape the content of a given URL using Playwright's async API and scrolling.
125+
126+
Notes:
127+
- The user gets to decide between scrolling to the bottom of the page or scrolling for a finite amount of time.
128+
- If the user chooses to scroll to the bottom, the scraper will stop when the page height stops changing or when
129+
the timeout is reached. In this case, the user should opt for an appropriate timeout value i.e. larger than usual.
130+
- Sleep needs to be set to a value greater than 0 to allow lazy-loaded content to load.
131+
Additionally, if used with scroll_to_bottom=True, the sleep value should be set to a higher value, to
132+
make sure that the scrolling actually happens, thereby allowing the page height to change.
133+
- Probably the best website to test this is https://www.reddit.com/ as it has infinite scrolling.
134+
135+
Args:
136+
- url (str): The URL to scrape.
137+
- timeout (Union[int, None]): The maximum time to spend scrolling. This is separate from the global timeout. If set, must be greater than 0.
138+
Can also be set to None, in which case the scraper will only stop when the page height stops changing.
139+
- scroll (int): The number of pixels to scroll down by. Defaults to 15000. Cannot be less than 5000 pixels.
140+
Less than this and we don't scroll enough to see any content change.
141+
- sleep (float): The number of seconds to sleep after each scroll, to allow the page to load.
142+
Defaults to 2. Must be greater than 0.
143+
144+
Returns:
145+
str: The scraped HTML content
146+
147+
Raises:
148+
- ValueError: If the timeout value is less than or equal to 0.
149+
- ValueError: If the sleep value is less than or equal to 0.
150+
- ValueError: If the scroll value is less than 5000.
151+
"""
152+
# NB: I have tested using scrollHeight to determine when to stop scrolling
153+
# but it doesn't always work as expected. The page height doesn't change on some sites like
154+
# https://www.steelwood.amsterdam/. The site does not scroll to the bottom.
155+
# In my browser I can scroll vertically but in Chromium it scrolls horizontally?!?
156+
157+
if timeout and timeout <= 0:
158+
raise ValueError("If set, timeout value for scrolling scraper must be greater than 0.")
159+
160+
if sleep <= 0:
161+
raise ValueError("Sleep for scrolling scraper value must be greater than 0.")
162+
163+
if scroll < 5000:
164+
raise ValueError("Scroll value for scrolling scraper must be greater than or equal to 5000.")
165+
166+
from playwright.async_api import async_playwright
167+
from undetected_playwright import Malenia
168+
import time
169+
170+
logger.info(f"Starting scraping with scrolling support for {url}...")
171+
172+
results = ""
173+
attempt = 0
174+
175+
while attempt < self.RETRY_LIMIT:
176+
try:
177+
async with async_playwright() as p:
178+
browser = await p.chromium.launch(
179+
headless=self.headless, proxy=self.proxy, **self.browser_config
180+
)
181+
context = await browser.new_context()
182+
await Malenia.apply_stealth(context)
183+
page = await context.new_page()
184+
await page.goto(url, wait_until="domcontentloaded")
185+
await page.wait_for_load_state(self.load_state)
186+
187+
previous_height = None
188+
start_time = time.time()
189+
190+
# Store the heights of the page after each scroll
191+
# This is useful in case we scroll with a timer and want to stop shortly after reaching the bottom
192+
# or simply when the page stops changing for some reason.
193+
heights = []
194+
195+
while True:
196+
current_height = await page.evaluate("document.body.scrollHeight")
197+
heights.append(current_height)
198+
heights = heights[-5:] # Keep only the last 5 heights, to not run out of memory
199+
200+
# Break if we've reached the bottom of the page i.e. if scrolling makes no more progress
201+
# Attention!!! This is not always reliable. Sometimes the page might not change due to lazy loading
202+
# or other reasons. In such cases, the user should set scroll_to_bottom=False and set a timeout.
203+
if scroll_to_bottom and previous_height == current_height:
204+
logger.info(f"Reached bottom of page for url {url}")
205+
break
206+
207+
previous_height = current_height
208+
209+
await page.mouse.wheel(0, scroll)
210+
logger.debug(f"Scrolled {url} to current height {current_height}px...")
211+
time.sleep(sleep) # Allow some time for any lazy-loaded content to load
212+
213+
current_time = time.time()
214+
elapsed_time = current_time - start_time
215+
logger.debug(f"Elapsed time: {elapsed_time} seconds")
216+
217+
if timeout:
218+
if elapsed_time >= timeout:
219+
logger.info(f"Reached timeout of {timeout} seconds for url {url}")
220+
break
221+
elif len(heights) == 5 and len(set(heights)) == 1:
222+
logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
223+
break
224+
225+
results = await page.content()
226+
break
227+
228+
except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
229+
attempt += 1
230+
logger.error(f"Attempt {attempt} failed: {e}")
231+
if attempt == self.RETRY_LIMIT:
232+
results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
233+
finally:
234+
await browser.close()
235+
236+
return results
237+
114238
async def ascrape_playwright(self, url: str) -> str:
115239
"""
116240
Asynchronously scrape the content of a given URL using Playwright's async API.
241+
242+
Args:
243+
url (str): The URL to scrape.
244+
245+
Returns:
246+
str: The scraped HTML content or an error message if an exception occurs.
117247
"""
118248
from playwright.async_api import async_playwright
119249
from undetected_playwright import Malenia
120250

121251
logger.info(f"Starting scraping with {self.backend}...")
252+
results = ""
122253
attempt = 0
123254

124255
while attempt < self.RETRY_LIMIT:
@@ -136,19 +267,21 @@ async def ascrape_playwright(self, url: str) -> str:
136267
await page.wait_for_load_state(self.load_state)
137268
results = await page.content()
138269
logger.info("Content scraped")
139-
return results
270+
break
140271
except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
141272
attempt += 1
142273
logger.error(f"Attempt {attempt} failed: {e}")
143274
if attempt == self.RETRY_LIMIT:
144-
raise RuntimeError(
145-
f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}"
146-
)
275+
results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
147276
finally:
148277
if "browser" in locals():
149278
await browser.close()
150279

151280

281+
return results
282+
283+
284+
152285
async def ascrape_with_js_support(self, url: str) -> str:
153286
"""
154287
Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.

scrapegraphai/helpers/models_tokens.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,4 +254,7 @@
254254
"mixtral-moe-8x22B-instruct": 65536,
255255
"mixtral-moe-8x7B-instruct": 65536,
256256
},
257+
"togetherai" : {
258+
"Meta-Llama-3.1-70B-Instruct-Turbo": 128000
259+
}
257260
}

0 commit comments

Comments
 (0)