Skip to content

Commit d803fc4

Browse files
parthsompura, Parth Pathak, mdrxy
authored
feat(document_loaders): add flexible timeout to PlaywrightURLLoader (#104)
## Description

This PR enhances the `PlaywrightURLLoader` by adding configurable timeout and page load strategy options, making it more flexible for handling dynamic web pages. This addresses issue #103.

### Changes

- Added `timeout` parameter (default: 30000ms) to control page navigation timeout
- Added `wait_until` parameter to control when navigation is considered complete
- Supported `wait_until` options:
  - `"load"` (default): wait for the "load" event
  - `"domcontentloaded"`: wait for the "DOMContentLoaded" event
  - `"networkidle"`: wait until there are no network connections for at least 500ms
  - `"commit"`: wait until the network response is received and the document starts loading

### Why

The current implementation has a hardcoded 30-second timeout, which can be insufficient for heavy dynamic pages. This change allows users to:

- Set longer timeouts for complex pages
- Choose appropriate page load strategies based on their needs
- Better handle dynamic content loading

### Real-World Examples

This PR solves timeout issues with various types of websites:

1. Weather websites:

   ```python
   loader = PlaywrightURLLoader(
       urls=["https://weather.com/en-IN/weather/tenday/l/Chennai+Tamil+Nadu?canonicalCityId=251b7b4afedf19f747b425e048038eb1"],
       timeout=60000,  # 60 second timeout
       wait_until="domcontentloaded"
   )
   ```

2. Dynamic news sites:

   ```python
   loader = PlaywrightURLLoader(
       urls=["https://www.reuters.com/markets/"],
       timeout=45000,
       wait_until="networkidle"
   )
   ```

3. E-commerce sites:

   ```python
   loader = PlaywrightURLLoader(
       urls=["https://www.amazon.com/dp/B08N5KWB9H"],
       timeout=90000,  # 90 second timeout for complex product pages
       wait_until="load"
   )
   ```

### Testing

- Added new test cases for both sync and async methods
- Maintained backward compatibility
- All existing tests pass
- Tested with various real-world websites

### Related Issues

Closes #103

---------

Co-authored-by: Parth Pathak <[email protected]>
Co-authored-by: Mason Daugherty <[email protected]>
Co-authored-by: Mason Daugherty <[email protected]>
1 parent 2a43e89 commit d803fc4

File tree

2 files changed

+58
-5
lines changed

2 files changed

+58
-5
lines changed

libs/community/langchain_community/document_loaders/url_playwright.py

Lines changed: 30 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -116,6 +116,16 @@ class PlaywrightURLLoader(BaseLoader):
116116
through the specified proxy.
117117
browser_session (Optional[Union[str, os.PathLike[str]]]): Path to a file with
118118
browser session data that can be used to restore the browser session.
119+
timeout (Optional[int]): Timeout in milliseconds for page navigation.
120+
wait_until (Optional[str]): When to consider navigation succeeded.
121+
122+
Can be one of:
123+
124+
- "load": wait for the "load" event to fire
125+
- "domcontentloaded": wait for the "DOMContentLoaded" event to fire
126+
- "networkidle": wait until there are no network connections for at least
127+
500ms
128+
- "commit": wait until the network response is received and the document starts loading
119129
120130
Example:
121131
.. code-block:: python
@@ -128,7 +138,12 @@ class PlaywrightURLLoader(BaseLoader):
128138
"username": "username",
129139
"password": "password"
130140
}
131-
loader = PlaywrightURLLoader(urls, proxy=proxy)
141+
loader = PlaywrightURLLoader(
142+
urls=urls,
143+
proxy=proxy,
144+
timeout=60000, # 60 second timeout
145+
wait_until="domcontentloaded" # Wait for DOM content to load
146+
)
132147
data = loader.load()
133148
"""
134149

@@ -141,6 +156,8 @@ def __init__(
141156
evaluator: Optional[PlaywrightEvaluator] = None,
142157
proxy: Optional[Dict[str, str]] = None,
143158
browser_session: Optional[Union[str, os.PathLike[str]]] = None,
159+
timeout: Optional[int] = 30000,
160+
wait_until: Optional[str] = "load",
144161
):
145162
"""Load a list of URLs using Playwright."""
146163
try:
@@ -156,6 +173,8 @@ def __init__(
156173
self.headless = headless
157174
self.proxy = proxy
158175
self.browser_session = browser_session
176+
self.timeout = timeout
177+
self.wait_until = wait_until
159178

160179
if remove_selectors and evaluator:
161180
raise ValueError(
@@ -189,11 +208,13 @@ def lazy_load(self) -> Iterator[Document]:
189208
for url in self.urls:
190209
try:
191210
page = context.new_page()
192-
response = page.goto(url)
211+
response = page.goto(
212+
url, timeout=self.timeout, wait_until=self.wait_until
213+
)
193214
if response is None:
194215
raise ValueError(f"page.goto() returned None for url {url}")
195216

196-
page.wait_for_load_state("load")
217+
page.wait_for_load_state(self.wait_until, timeout=self.timeout)
197218

198219
text = self.evaluator.evaluate(page, browser, response)
199220
page.close()
@@ -244,11 +265,15 @@ async def alazy_load(self) -> AsyncIterator[Document]:
244265
for url in self.urls:
245266
try:
246267
page = await context.new_page()
247-
response = await page.goto(url)
268+
response = await page.goto(
269+
url, timeout=self.timeout, wait_until=self.wait_until
270+
)
248271
if response is None:
249272
raise ValueError(f"page.goto() returned None for url {url}")
250273

251-
await page.wait_for_load_state("load")
274+
await page.wait_for_load_state(
275+
self.wait_until, timeout=self.timeout
276+
)
252277

253278
text = await self.evaluator.evaluate_async(page, browser, response)
254279
await page.close()

libs/community/tests/integration_tests/document_loaders/test_url_playwright.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,20 @@ def test_playwright_url_loader() -> None:
4242
assert len(docs) > 0
4343

4444

45+
def test_playwright_url_loader_with_timeout() -> None:
46+
"""Test Playwright URL loader with custom timeout and wait_until."""
47+
urls = ["https://techmeme.com"]
48+
loader = PlaywrightURLLoader(
49+
urls=urls,
50+
timeout=60000, # 60 second timeout
51+
wait_until="domcontentloaded", # Wait for DOM content to load
52+
continue_on_failure=False,
53+
headless=True,
54+
)
55+
docs = loader.load()
56+
assert len(docs) > 0
57+
58+
4559
async def test_playwright_async_url_loader() -> None:
4660
"""Test Playwright async URL loader."""
4761
urls = [
@@ -60,6 +74,20 @@ async def test_playwright_async_url_loader() -> None:
6074
assert len(docs) > 0
6175

6276

77+
async def test_playwright_async_url_loader_with_timeout() -> None:
78+
"""Test Playwright async URL loader with custom timeout and wait_until."""
79+
urls = ["https://techmeme.com"]
80+
loader = PlaywrightURLLoader(
81+
urls=urls,
82+
timeout=60000, # 60 second timeout
83+
wait_until="domcontentloaded", # Wait for DOM content to load
84+
continue_on_failure=False,
85+
headless=True,
86+
)
87+
docs = await loader.aload()
88+
assert len(docs) > 0
89+
90+
6391
def test_playwright_url_loader_with_custom_evaluator() -> None:
6492
"""Test Playwright URL loader with a custom evaluator."""
6593
urls = ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]

0 commit comments

Comments
 (0)