1
+ import pdb
2
+
1
3
import pyperclip
2
4
from typing import Optional , Type
3
5
from pydantic import BaseModel
21
23
22
24
# Module-level logger named after this module (`__name__`).
logger = logging .getLogger (__name__ )
23
25
26
+
24
27
class CustomController (Controller ):
25
28
def __init__(self, exclude_actions: Optional[list[str]] = None,
             output_model: Optional[Type[BaseModel]] = None
             ):
    """Initialise the controller and register the custom actions.

    Args:
        exclude_actions: Forwarded to the base ``Controller`` as its
            ``exclude_actions`` argument. ``None`` (the default) is
            replaced by a new empty list.
        output_model: Optional pydantic model class, forwarded unchanged
            to the base ``Controller``.
    """
    # Build a fresh list per call: a literal ``[]`` default would be one
    # shared object reused by every instance created without the argument.
    if exclude_actions is None:
        exclude_actions = []
    super().__init__(exclude_actions=exclude_actions, output_model=output_model)
    self._register_custom_actions()
30
33
@@ -44,20 +47,25 @@ async def paste_from_clipboard(browser: BrowserContext):
44
47
await page .keyboard .type (text )
45
48
46
49
return ActionResult (extracted_content = text )
47
-
50
+
48
51
@self.registry.action(
    'Extract page content to get the pure text or markdown with links if include_links is set to true',
    param_model=ExtractPageContentAction,
    requires_browser=True,
)
async def extract_content(params: ExtractPageContentAction, browser: BrowserContext):
    """Extract the current page's main content through the Jina reader.

    The current page is navigated to ``https://r.jina.ai/<current url>``,
    the resulting HTML is passed to ``MainContentExtractor.extract``, and
    the page is then sent back one history entry.

    Args:
        params: Action parameters; ``include_links`` selects ``'markdown'``
            output when truthy and ``'text'`` otherwise.
        browser: Browser context that supplies the current page.

    Returns:
        An ``ActionResult`` whose ``extracted_content`` is the formatted
        message containing the extracted content.
    """
    page = await browser.get_current_page()
    output_format = 'markdown' if params.include_links else 'text'

    # use jina reader
    jina_url = f"https://r.jina.ai/{page.url}"
    # Navigation stays outside the try block: if it fails there may be no
    # new history entry, and going back would leave the original page.
    await page.goto(jina_url)
    try:
        content = MainContentExtractor.extract(  # type: ignore
            html=await page.content(),
            output_format=output_format,
        )
    finally:
        # go back to the original url, even when extraction raised, so the
        # browser is not left on the reader page
        await page.go_back()

    msg = f'📄 Extracted page content as {output_format}\n: {content}\n'
    logger.info(msg)
    return ActionResult(extracted_content=msg)
0 commit comments