Skip to content

Commit f23f203

Browse files
Den 281 extract as markdown (#58)
* began adding markdown functionality * Changed all legacy naming to "extract". Added _last_frame_navigated_timestamp functionality to page to detect if page is still loading. Added markdownify * Loading added to extract. Also added small sleep to screenshot manager. * Cleaned up screenshot manager and added a screenshot mixin. * update poetry.lock --------- Co-authored-by: Arian Hanifi <[email protected]>
1 parent 3fd3074 commit f23f203

22 files changed

+283
-81
lines changed

dendrite_sdk/async_api/_api/browser_api_client.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,14 @@
77
from dendrite_sdk.async_api._api.dto.authenticate_dto import AuthenticateDTO
88
from dendrite_sdk.async_api._api.dto.get_elements_dto import GetElementsDTO
99
from dendrite_sdk.async_api._api.dto.make_interaction_dto import MakeInteractionDTO
10-
from dendrite_sdk.async_api._api.dto.scrape_page_dto import ScrapePageDTO
10+
from dendrite_sdk.async_api._api.dto.extract_dto import ExtractDTO
1111
from dendrite_sdk.async_api._api.dto.try_run_script_dto import TryRunScriptDTO
1212
from dendrite_sdk.async_api._api.dto.upload_auth_session_dto import UploadAuthSessionDTO
1313
from dendrite_sdk.async_api._api.response.ask_page_response import AskPageResponse
1414
from dendrite_sdk.async_api._api.response.interaction_response import (
1515
InteractionResponse,
1616
)
17-
from dendrite_sdk.async_api._api.response.scrape_page_response import ScrapePageResponse
17+
from dendrite_sdk.async_api._api.response.extract_response import ExtractResponse
1818
from dendrite_sdk.async_api._api._http_client import HTTPClient
1919
from dendrite_sdk._common._exceptions.dendrite_exception import (
2020
InvalidAuthSessionError,
@@ -55,12 +55,12 @@ async def make_interaction(self, dto: MakeInteractionDTO) -> InteractionResponse
5555
status=res_dict["status"], message=res_dict["message"]
5656
)
5757

58-
async def scrape_page(self, dto: ScrapePageDTO) -> ScrapePageResponse:
58+
async def extract(self, dto: ExtractDTO) -> ExtractResponse:
5959
res = await self.send_request(
60-
"actions/scrape-page", data=dto.dict(), method="POST"
60+
"actions/extract-page", data=dto.dict(), method="POST"
6161
)
6262
res_dict = res.json()
63-
return ScrapePageResponse(
63+
return ExtractResponse(
6464
status=res_dict["status"],
6565
message=res_dict["message"],
6666
return_data=res_dict["return_data"],
@@ -79,9 +79,7 @@ async def ask_page(self, dto: AskPageDTO) -> AskPageResponse:
7979
return_data=res_dict["return_data"],
8080
)
8181

82-
async def try_run_cached(
83-
self, dto: TryRunScriptDTO
84-
) -> Optional[ScrapePageResponse]:
82+
async def try_run_cached(self, dto: TryRunScriptDTO) -> Optional[ExtractResponse]:
8583
res = await self.send_request(
8684
"actions/try-run-cached", data=dto.dict(), method="POST"
8785
)
@@ -92,7 +90,7 @@ async def try_run_cached(
9290
if loaded_value is None:
9391
return None
9492

95-
return ScrapePageResponse(
93+
return ExtractResponse(
9694
status=res_dict["status"],
9795
message=res_dict["message"],
9896
return_data=loaded_value,

dendrite_sdk/async_api/_api/dto/scrape_page_dto.py renamed to dendrite_sdk/async_api/_api/dto/extract_dto.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from dendrite_sdk.async_api._core.models.page_information import PageInformation
66

77

8-
class ScrapePageDTO(BaseModel):
8+
class ExtractDTO(BaseModel):
99
page_information: PageInformation
1010
api_config: APIConfig
1111
prompt: str

dendrite_sdk/async_api/_api/response/scrape_page_response.py renamed to dendrite_sdk/async_api/_api/response/extract_response.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
T = TypeVar("T")
88

99

10-
class ScrapePageResponse(BaseModel, Generic[T]):
10+
class ExtractResponse(BaseModel, Generic[T]):
1111
return_data: T
1212
message: str
1313
created_script: Optional[str] = None

dendrite_sdk/async_api/_core/_managers/screenshot_manager.py

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,27 @@
33
import os
44
from typing import Tuple
55
from uuid import uuid4
6-
from playwright.async_api import Page
76

87
from dendrite_sdk.async_api._core._type_spec import PlaywrightPage
98

109

1110
class ScreenshotManager:
12-
def __init__(self) -> None:
11+
def __init__(self, page: PlaywrightPage) -> None:
1312
self.screenshot_before: str = ""
1413
self.screenshot_after: str = ""
14+
self.page = page
1515

16-
async def take_full_page_screenshot(self, page: Page) -> str:
17-
image_data = await page.screenshot(type="jpeg", full_page=True, timeout=30000)
16+
async def take_full_page_screenshot(self) -> str:
17+
image_data = await self.page.screenshot(
18+
type="jpeg", full_page=True, timeout=30000
19+
)
1820
if image_data is None:
1921
return ""
2022

2123
return base64.b64encode(image_data).decode("utf-8")
2224

23-
async def take_viewport_screenshot(self, page: Page) -> str:
24-
image_data = await page.screenshot(type="jpeg", timeout=30000)
25+
async def take_viewport_screenshot(self) -> str:
26+
image_data = await self.page.screenshot(type="jpeg", timeout=30000)
2527

2628
if image_data is None:
2729
return ""
@@ -30,18 +32,6 @@ async def take_viewport_screenshot(self, page: Page) -> str:
3032

3133
return reduced_base64
3234

33-
async def start_recording_diff(self, page: Page):
34-
self.screenshot_before = await self.take_viewport_screenshot(page)
35-
36-
async def get_diff_images(
37-
self,
38-
page: PlaywrightPage,
39-
wait_time=1,
40-
) -> Tuple[str, str]:
41-
await asyncio.sleep(wait_time)
42-
self.screenshot_after = await self.take_viewport_screenshot(page)
43-
return self.screenshot_before, self.screenshot_after
44-
4535
def store_screenshot(self, name, image_data):
4636
if not name:
4737
name = str(uuid4())

dendrite_sdk/async_api/_core/dendrite_browser.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@
3131
from dendrite_sdk.async_api._core.mixin.fill_fields import FillFieldsMixin
3232
from dendrite_sdk.async_api._core.mixin.get_element import GetElementMixin
3333
from dendrite_sdk.async_api._core.mixin.keyboard import KeyboardMixin
34+
from dendrite_sdk.async_api._core.mixin.screenshot import ScreenshotMixin
3435
from dendrite_sdk.async_api._core.mixin.wait_for import WaitForMixin
36+
from dendrite_sdk.async_api._core.mixin.markdown import MarkdownMixin
3537
from dendrite_sdk.async_api._core.models.authentication import (
3638
AuthSession,
3739
)
@@ -47,8 +49,10 @@
4749

4850

4951
class AsyncDendrite(
50-
ExtractionMixin,
52+
ScreenshotMixin,
5153
WaitForMixin,
54+
MarkdownMixin,
55+
ExtractionMixin,
5256
AskMixin,
5357
FillFieldsMixin,
5458
ClickMixin,

dendrite_sdk/async_api/_core/dendrite_page.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,16 @@ def __init__(
7676
browser_api_client: "BrowserAPIClient",
7777
):
7878
self.playwright_page = page
79-
self.screenshot_manager = ScreenshotManager()
79+
self.screenshot_manager = ScreenshotManager(page)
8080
self.dendrite_browser = dendrite_browser
8181
self._browser_api_client = browser_api_client
82+
self._last_frame_navigated_timestamp = time.time()
83+
84+
self.playwright_page.on("framenavigated", self._on_frame_navigated)
85+
86+
def _on_frame_navigated(self, frame):
87+
if frame is self.playwright_page.main_frame:
88+
self._last_frame_navigated_timestamp = time.time()
8289

8390
@property
8491
def url(self):
@@ -246,15 +253,14 @@ async def get_page_information(self) -> PageInformation:
246253
PageInformation: An object containing the page's URL, raw HTML, and a screenshot in base64 format.
247254
"""
248255

249-
base64 = await self.screenshot_manager.take_full_page_screenshot(
250-
self.playwright_page
251-
)
256+
base64 = await self.screenshot_manager.take_full_page_screenshot()
252257
soup = await self._get_soup()
253258

254259
return PageInformation(
255260
url=self.playwright_page.url,
256261
raw_html=str(soup),
257262
screenshot_base64=base64,
263+
time_since_frame_navigated=self.get_time_since_last_frame_navigated(),
258264
)
259265

260266
async def _generate_dendrite_ids(self):
@@ -393,3 +399,12 @@ async def _dump_html(self, path: str) -> None:
393399

394400
with open(path, "w") as f:
395401
f.write(await self.playwright_page.content())
402+
403+
def get_time_since_last_frame_navigated(self) -> float:
404+
"""
405+
Get the time elapsed since the last main-frame navigation.
406+
407+
Returns:
408+
float: The number of seconds elapsed since the last main-frame navigation.
409+
"""
410+
return time.time() - self._last_frame_navigated_timestamp

dendrite_sdk/async_api/_core/mixin/extract.py

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
import asyncio
2+
import time
13
from typing import Any, Optional, Type, overload
2-
from dendrite_sdk.async_api._api.dto.scrape_page_dto import ScrapePageDTO
4+
from dendrite_sdk.async_api._api.dto.extract_dto import ExtractDTO
35
from dendrite_sdk.async_api._core._type_spec import (
46
JsonSchema,
57
PydanticModel,
@@ -25,6 +27,7 @@ async def extract(
2527
prompt: str,
2628
type_spec: Type[bool],
2729
use_cache: bool = True,
30+
timeout: int = 180,
2831
) -> bool: ...
2932

3033
@overload
@@ -33,6 +36,7 @@ async def extract(
3336
prompt: str,
3437
type_spec: Type[int],
3538
use_cache: bool = True,
39+
timeout: int = 180,
3640
) -> int: ...
3741

3842
@overload
@@ -41,6 +45,7 @@ async def extract(
4145
prompt: str,
4246
type_spec: Type[float],
4347
use_cache: bool = True,
48+
timeout: int = 180,
4449
) -> float: ...
4550

4651
@overload
@@ -49,6 +54,7 @@ async def extract(
4954
prompt: str,
5055
type_spec: Type[str],
5156
use_cache: bool = True,
57+
timeout: int = 180,
5258
) -> str: ...
5359

5460
@overload
@@ -57,6 +63,7 @@ async def extract(
5763
prompt: Optional[str],
5864
type_spec: Type[PydanticModel],
5965
use_cache: bool = True,
66+
timeout: int = 180,
6067
) -> PydanticModel: ...
6168

6269
@overload
@@ -65,6 +72,7 @@ async def extract(
6572
prompt: Optional[str],
6673
type_spec: JsonSchema,
6774
use_cache: bool = True,
75+
timeout: int = 180,
6876
) -> JsonSchema: ...
6977

7078
@overload
@@ -73,13 +81,15 @@ async def extract(
7381
prompt: str,
7482
type_spec: None = None,
7583
use_cache: bool = True,
84+
timeout: int = 180,
7685
) -> Any: ...
7786

7887
async def extract(
7988
self,
8089
prompt: Optional[str],
8190
type_spec: Optional[TypeSpec] = None,
8291
use_cache: bool = True,
92+
timeout: int = 180,
8393
) -> TypeSpec:
8494
"""
8595
Extract data from a web page based on a prompt and optional type specification.
@@ -88,9 +98,13 @@ async def extract(
8898
prompt (Optional[str]): The prompt to guide the extraction.
8999
type_spec (Optional[TypeSpec], optional): The type specification for the extracted data.
90100
use_cache (bool, optional): Whether to use cached results. Defaults to True.
101+
timeout (int, optional): The maximum time to wait for extraction in seconds. Defaults to 180 seconds, which is 3 minutes.
91102
92103
Returns:
93-
ScrapePageResponse: The extracted data wrapped in a ScrapePageResponse object.
104+
ExtractResponse: The extracted data wrapped in an ExtractResponse object.
105+
106+
Raises:
107+
TimeoutError: If the extraction process exceeds the specified timeout.
94108
"""
95109
json_schema = None
96110
if type_spec:
@@ -99,22 +113,40 @@ async def extract(
99113
if prompt is None:
100114
prompt = ""
101115

116+
init_start_time = time.time()
102117
page = await self._get_page()
118+
103119
page_information = await page.get_page_information()
104-
scrape_dto = ScrapePageDTO(
120+
extract_dto = ExtractDTO(
105121
page_information=page_information,
106122
api_config=self._get_dendrite_browser().api_config,
107123
prompt=prompt,
108124
return_data_json_schema=json_schema,
109125
use_screenshot=True,
110126
use_cache=use_cache,
111127
)
112-
res = await self._get_browser_api_client().scrape_page(scrape_dto)
128+
129+
delay = 1
130+
while True:
131+
elapsed_time = time.time() - init_start_time
132+
if elapsed_time > timeout:
133+
raise TimeoutError(
134+
f"Extraction process exceeded the timeout of {timeout} seconds"
135+
)
136+
137+
start_time = time.time()
138+
res = await self._get_browser_api_client().extract(extract_dto)
139+
request_time = time.time() - start_time
140+
141+
if res.status != "loading":
142+
break
143+
144+
remaining_delay = max(0, delay - request_time)
145+
await asyncio.sleep(remaining_delay)
146+
delay = min(delay * 2, 20) # Exponential back-off, capped at 20 seconds
113147

114148
converted_res = res.return_data
115149
if type_spec is not None:
116150
converted_res = convert_to_type_spec(type_spec, res.return_data)
117151

118-
res = converted_res
119-
120-
return res
152+
return converted_res
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from typing import Optional
2+
from dendrite_sdk.async_api._core.mixin.extract import ExtractionMixin
3+
from dendrite_sdk.async_api._core.protocol.page_protocol import DendritePageProtocol
4+
5+
from markdownify import markdownify as md
6+
7+
8+
class MarkdownMixin(ExtractionMixin, DendritePageProtocol):
9+
async def markdown(self, prompt: Optional[str] = None):
10+
page = await self._get_page()
11+
page_information = await page.get_page_information()
12+
if prompt:
13+
extract_prompt = f"Extract and return the html for this requested section of the website:\n\n{prompt}"
14+
res = await self.extract(extract_prompt, str)
15+
return md(res, heading_style="ATX")
16+
else:
17+
return md(page_information.raw_html)
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from dendrite_sdk.async_api._core.protocol.page_protocol import DendritePageProtocol
2+
3+
4+
class ScreenshotMixin(DendritePageProtocol):
5+
6+
async def screenshot(self, full_page: bool = False) -> str:
7+
"""
8+
Take a screenshot of the current page.
9+
10+
Args:
11+
full_page (bool, optional): If True, captures the full page. If False, captures only the viewport. Defaults to False.
12+
13+
Returns:
14+
str: A base64 encoded string of the screenshot in JPEG format.
15+
"""
16+
page = await self._get_page()
17+
if full_page:
18+
return await page.screenshot_manager.take_full_page_screenshot()
19+
else:
20+
return await page.screenshot_manager.take_viewport_screenshot()

dendrite_sdk/async_api/_core/models/page_information.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ class PageInformation(BaseModel):
1212
url: str
1313
raw_html: str
1414
screenshot_base64: str
15+
time_since_frame_navigated: float

0 commit comments

Comments
 (0)