1+ import asyncio
2+ import time
13from typing import Any , Optional , Type , overload
2- from dendrite_sdk .async_api ._api .dto .scrape_page_dto import ScrapePageDTO
4+ from dendrite_sdk .async_api ._api .dto .extract_dto import ExtractDTO
35from dendrite_sdk .async_api ._core ._type_spec import (
46 JsonSchema ,
57 PydanticModel ,
@@ -25,6 +27,7 @@ async def extract(
2527 prompt : str ,
2628 type_spec : Type [bool ],
2729 use_cache : bool = True ,
30+ timeout : int = 180 ,
2831 ) -> bool : ...
2932
3033 @overload
@@ -33,6 +36,7 @@ async def extract(
3336 prompt : str ,
3437 type_spec : Type [int ],
3538 use_cache : bool = True ,
39+ timeout : int = 180 ,
3640 ) -> int : ...
3741
3842 @overload
@@ -41,6 +45,7 @@ async def extract(
4145 prompt : str ,
4246 type_spec : Type [float ],
4347 use_cache : bool = True ,
48+ timeout : int = 180 ,
4449 ) -> float : ...
4550
4651 @overload
@@ -49,6 +54,7 @@ async def extract(
4954 prompt : str ,
5055 type_spec : Type [str ],
5156 use_cache : bool = True ,
57+ timeout : int = 180 ,
5258 ) -> str : ...
5359
5460 @overload
@@ -57,6 +63,7 @@ async def extract(
5763 prompt : Optional [str ],
5864 type_spec : Type [PydanticModel ],
5965 use_cache : bool = True ,
66+ timeout : int = 180 ,
6067 ) -> PydanticModel : ...
6168
6269 @overload
@@ -65,6 +72,7 @@ async def extract(
6572 prompt : Optional [str ],
6673 type_spec : JsonSchema ,
6774 use_cache : bool = True ,
75+ timeout : int = 180 ,
6876 ) -> JsonSchema : ...
6977
7078 @overload
@@ -73,13 +81,15 @@ async def extract(
7381 prompt : str ,
7482 type_spec : None = None ,
7583 use_cache : bool = True ,
84+ timeout : int = 180 ,
7685 ) -> Any : ...
7786
7887 async def extract (
7988 self ,
8089 prompt : Optional [str ],
8190 type_spec : Optional [TypeSpec ] = None ,
8291 use_cache : bool = True ,
92+ timeout : int = 180 ,
8393 ) -> TypeSpec :
8494 """
8595 Extract data from a web page based on a prompt and optional type specification.
@@ -88,9 +98,13 @@ async def extract(
8898 prompt (Optional[str]): The prompt to guide the extraction.
8999 type_spec (Optional[TypeSpec], optional): The type specification for the extracted data.
90100 use_cache (bool, optional): Whether to use cached results. Defaults to True.
101+ timeout (int, optional): The maximum time to wait for extraction in seconds. Defaults to 180 seconds, which is 3 minutes.
91102
92103 Returns:
93- ScrapePageResponse: The extracted data wrapped in a ScrapePageResponse object.
104+ TypeSpec: The extracted data, converted to match type_spec when one is provided.
105+
106+ Raises:
107+ TimeoutError: If the extraction process exceeds the specified timeout.
94108 """
95109 json_schema = None
96110 if type_spec :
@@ -99,22 +113,40 @@ async def extract(
99113 if prompt is None :
100114 prompt = ""
101115
116+ init_start_time = time .time ()
102117 page = await self ._get_page ()
118+
103119 page_information = await page .get_page_information ()
104- scrape_dto = ScrapePageDTO (
120+ extract_dto = ExtractDTO (
105121 page_information = page_information ,
106122 api_config = self ._get_dendrite_browser ().api_config ,
107123 prompt = prompt ,
108124 return_data_json_schema = json_schema ,
109125 use_screenshot = True ,
110126 use_cache = use_cache ,
111127 )
112- res = await self ._get_browser_api_client ().scrape_page (scrape_dto )
128+
129+ delay = 1
130+ while True :
131+ elapsed_time = time .time () - init_start_time
132+ if elapsed_time > timeout :
133+ raise TimeoutError (
134+ f"Extraction process exceeded the timeout of { timeout } seconds"
135+ )
136+
137+ start_time = time .time ()
138+ res = await self ._get_browser_api_client ().extract (extract_dto )
139+ request_time = time .time () - start_time
140+
141+ if res .status != "loading" :
142+ break
143+
144+ remaining_delay = max (0 , delay - request_time )
145+ await asyncio .sleep (remaining_delay )
146+ delay = min (delay * 2 , 20 ) # Exponential back-off, capped at 20 seconds
113147
114148 converted_res = res .return_data
115149 if type_spec is not None :
116150 converted_res = convert_to_type_spec (type_spec , res .return_data )
117151
118- res = converted_res
119-
120- return res
152+ return converted_res
0 commit comments