1
1
# flake8: noqa
2
- # type: ignore
3
2
import time
4
3
from sys import platform
5
-
6
- black_listed_elements = {
4
+ from typing import (
5
+ TYPE_CHECKING ,
6
+ Any ,
7
+ Dict ,
8
+ Iterable ,
9
+ List ,
10
+ Optional ,
11
+ Set ,
12
+ Tuple ,
13
+ TypedDict ,
14
+ Union ,
15
+ )
16
+
17
+ if TYPE_CHECKING :
18
+ from playwright .sync_api import Browser , CDPSession , Page , sync_playwright
19
+
20
+ black_listed_elements : Set [str ] = {
7
21
"html" ,
8
22
"head" ,
9
23
"title" ,
19
33
}
20
34
21
35
36
class ElementInViewPort(TypedDict):
    """Typed record describing one element kept by the crawler.

    Records of this shape are stored in ``Crawler.page_element_buffer`` and
    in the ``elements_in_view_port`` list built by ``Crawler.crawl``.
    """

    # Key used by crawl() to look the element up in its child_nodes mapping.
    node_index: str
    # NOTE(review): presumably the backendNodeId taken from the DOM
    # snapshot -- confirm against the elided part of crawl().
    backend_node_id: int
    node_name: Optional[str]
    node_value: Optional[str]
    # Attribute strings; crawl() joins them with single spaces for output.
    node_meta: List[str]
    is_clickable: bool
    # NOTE(review): origin_* looks like the top-left corner of the element's
    # bounds -- confirm; the computation is not visible here.
    origin_x: int
    origin_y: int
    # Coordinates that Crawler.click passes to page.mouse.click.
    center_x: int
    center_y: int
47
+
48
+
22
49
class Crawler :
23
def __init__(self) -> None:
    """Launch a visible Chromium browser and open a single page in it.

    The page's viewport is set to 1280x1080.

    Raises:
        ValueError: If the ``playwright`` package cannot be imported. The
            original ``ImportError`` is attached as the cause.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as err:
        # ValueError is kept (callers may catch it); chaining preserves the
        # underlying import failure for debugging.
        raise ValueError(
            "Could not import playwright python package. "
            "Please install it with `pip install playwright`."
        ) from err
    # NOTE(review): the object returned by sync_playwright().start() is not
    # retained, so nothing in this method can stop it later.
    self.browser: Browser = (
        sync_playwright().start().chromium.launch(headless=False)
    )
    self.page: Page = self.browser.new_page()
    self.page.set_viewport_size({"width": 1280, "height": 1080})
    # Start with an empty buffer so that click() called before go_to_page()
    # reports "Could not find element" instead of raising AttributeError.
    self.page_element_buffer: Dict[int, ElementInViewPort] = {}
    # Declared only; go_to_page() assigns the CDP session for the page.
    self.client: CDPSession
34
65
35
def go_to_page(self, url: str) -> None:
    """Navigate the page to *url* and reset the per-page crawler state.

    If *url* does not begin with a scheme (``name://``), ``http://`` is
    prepended before navigating. After navigation a new CDP session is
    opened for the page and ``page_element_buffer`` is emptied.

    Args:
        url: Address to open, with or without a leading scheme.
    """
    import re  # local import so this method does not rely on file-level imports

    # Look for a scheme at the START of the URL. A plain substring test
    # would mistake "example.com/?next=https://other" for an absolute URL
    # and hand it to the browser without any scheme.
    has_scheme = re.match(r"[A-Za-z][A-Za-z0-9+.\-]*://", url) is not None
    target = url if has_scheme else "http://" + url

    self.page.goto(url=target)
    self.client = self.page.context.new_cdp_session(self.page)
    self.page_element_buffer = {}
39
70
40
- def scroll (self , direction ) :
71
+ def scroll (self , direction : str ) -> None :
41
72
if direction == "up" :
42
73
self .page .evaluate (
43
74
"(document.scrollingElement || document.body).scrollTop = (document.scrollingElement || document.body).scrollTop - window.innerHeight;"
@@ -47,7 +78,7 @@ def scroll(self, direction):
47
78
"(document.scrollingElement || document.body).scrollTop = (document.scrollingElement || document.body).scrollTop + window.innerHeight;"
48
79
)
49
80
50
- def click (self , id ) :
81
+ def click (self , id : Union [ str , int ]) -> None :
51
82
# Inject javascript into the page which removes the target= attribute from all links
52
83
js = """
53
84
links = document.getElementsByTagName("a");
@@ -59,41 +90,37 @@ def click(self, id):
59
90
60
91
element = self .page_element_buffer .get (int (id ))
61
92
if element :
62
- x = element . get ( "center_x" )
63
- y = element . get ( "center_y" )
93
+ x : float = element [ "center_x" ]
94
+ y : float = element [ "center_y" ]
64
95
65
96
self .page .mouse .click (x , y )
66
97
else :
67
98
print ("Could not find element" )
68
99
69
def type(self, id: Union[str, int], text: str) -> None:
    """Click the element registered under *id*, then type *text*.

    Args:
        id: Identifier forwarded unchanged to ``self.click``.
        text: Characters sent through the page's keyboard after the click.
    """
    # Click first; the keystrokes are sent only after the click call returns.
    self.click(id)
    keyboard = self.page.keyboard
    keyboard.type(text)
72
103
73
def enter(self) -> None:
    """Press the Enter key on the page's keyboard."""
    keyboard = self.page.keyboard
    keyboard.press("Enter")
75
106
76
- def crawl (self ):
107
+ def crawl (self ) -> List [ str ] :
77
108
page = self .page
78
109
page_element_buffer = self .page_element_buffer
79
110
start = time .time ()
80
111
81
112
page_state_as_text = []
82
113
83
- device_pixel_ratio = page .evaluate ("window.devicePixelRatio" )
114
+ device_pixel_ratio : float = page .evaluate ("window.devicePixelRatio" )
84
115
if platform == "darwin" and device_pixel_ratio == 1 : # lies
85
116
device_pixel_ratio = 2
86
117
87
- win_scroll_x = page .evaluate ("window.scrollX" )
88
- win_scroll_y = page .evaluate ("window.scrollY" )
89
- win_upper_bound = page .evaluate ("window.pageYOffset" )
90
- win_left_bound = page .evaluate ("window.pageXOffset" )
91
- win_width = page .evaluate ("window.screen.width" )
92
- win_height = page .evaluate ("window.screen.height" )
93
- win_right_bound = win_left_bound + win_width
94
- win_lower_bound = win_upper_bound + win_height
95
- document_offset_height = page .evaluate ("document.body.offsetHeight" )
96
- document_scroll_height = page .evaluate ("document.body.scrollHeight" )
118
+ win_upper_bound : float = page .evaluate ("window.pageYOffset" )
119
+ win_left_bound : float = page .evaluate ("window.pageXOffset" )
120
+ win_width : float = page .evaluate ("window.screen.width" )
121
+ win_height : float = page .evaluate ("window.screen.height" )
122
+ win_right_bound : float = win_left_bound + win_width
123
+ win_lower_bound : float = win_upper_bound + win_height
97
124
98
125
# percentage_progress_start = (win_upper_bound / document_scroll_height) * 100
99
126
# percentage_progress_end = (
@@ -116,40 +143,35 @@ def crawl(self):
116
143
"DOMSnapshot.captureSnapshot" ,
117
144
{"computedStyles" : [], "includeDOMRects" : True , "includePaintOrder" : True },
118
145
)
119
- strings = tree ["strings" ]
120
- document = tree ["documents" ][0 ]
121
- nodes = document ["nodes" ]
122
- backend_node_id = nodes ["backendNodeId" ]
123
- attributes = nodes ["attributes" ]
124
- node_value = nodes ["nodeValue" ]
125
- parent = nodes ["parentIndex" ]
126
- node_types = nodes ["nodeType" ]
127
- node_names = nodes ["nodeName" ]
128
- is_clickable = set (nodes ["isClickable" ]["index" ])
129
-
130
- text_value = nodes ["textValue" ]
131
- text_value_index = text_value ["index" ]
132
- text_value_values = text_value ["value" ]
133
-
134
- input_value = nodes ["inputValue" ]
135
- input_value_index = input_value ["index" ]
136
- input_value_values = input_value ["value" ]
137
-
138
- input_checked = nodes ["inputChecked" ]
139
- layout = document ["layout" ]
140
- layout_node_index = layout ["nodeIndex" ]
141
- bounds = layout ["bounds" ]
142
-
143
- cursor = 0
144
- html_elements_text = []
145
-
146
- child_nodes = {}
147
- elements_in_view_port = []
148
-
149
- anchor_ancestry = {"-1" : (False , None )}
150
- button_ancestry = {"-1" : (False , None )}
151
-
152
- def convert_name (node_name , has_click_handler ):
146
+ strings : Dict [int , str ] = tree ["strings" ]
147
+ document : Dict [str , Any ] = tree ["documents" ][0 ]
148
+ nodes : Dict [str , Any ] = document ["nodes" ]
149
+ backend_node_id : Dict [int , int ] = nodes ["backendNodeId" ]
150
+ attributes : Dict [int , Dict [int , Any ]] = nodes ["attributes" ]
151
+ node_value : Dict [int , int ] = nodes ["nodeValue" ]
152
+ parent : Dict [int , int ] = nodes ["parentIndex" ]
153
+ node_names : Dict [int , int ] = nodes ["nodeName" ]
154
+ is_clickable : Set [int ] = set (nodes ["isClickable" ]["index" ])
155
+
156
+ input_value : Dict [str , Any ] = nodes ["inputValue" ]
157
+ input_value_index : List [int ] = input_value ["index" ]
158
+ input_value_values : List [int ] = input_value ["value" ]
159
+
160
+ layout : Dict [str , Any ] = document ["layout" ]
161
+ layout_node_index : List [int ] = layout ["nodeIndex" ]
162
+ bounds : Dict [int , List [float ]] = layout ["bounds" ]
163
+
164
+ cursor : int = 0
165
+
166
+ child_nodes : Dict [str , List [Dict [str , Any ]]] = {}
167
+ elements_in_view_port : List [ElementInViewPort ] = []
168
+
169
+ anchor_ancestry : Dict [str , Tuple [bool , Optional [int ]]] = {"-1" : (False , None )}
170
+ button_ancestry : Dict [str , Tuple [bool , Optional [int ]]] = {"-1" : (False , None )}
171
+
172
+ def convert_name (
173
+ node_name : Optional [str ], has_click_handler : Optional [bool ]
174
+ ) -> str :
153
175
if node_name == "a" :
154
176
return "link"
155
177
if node_name == "input" :
@@ -163,7 +185,9 @@ def convert_name(node_name, has_click_handler):
163
185
else :
164
186
return "text"
165
187
166
- def find_attributes (attributes , keys ):
188
+ def find_attributes (
189
+ attributes : Dict [int , Any ], keys : List [str ]
190
+ ) -> Dict [str , str ]:
167
191
values = {}
168
192
169
193
for [key_index , value_index ] in zip (* (iter (attributes ),) * 2 ):
@@ -181,7 +205,13 @@ def find_attributes(attributes, keys):
181
205
182
206
return values
183
207
184
- def add_to_hash_tree (hash_tree , tag , node_id , node_name , parent_id ):
208
+ def add_to_hash_tree (
209
+ hash_tree : Dict [str , Tuple [bool , Optional [int ]]],
210
+ tag : str ,
211
+ node_id : int ,
212
+ node_name : Optional [str ],
213
+ parent_id : int ,
214
+ ) -> Tuple [bool , Optional [int ]]:
185
215
parent_id_str = str (parent_id )
186
216
if not parent_id_str in hash_tree :
187
217
parent_name = strings [node_names [parent_id ]].lower ()
@@ -195,7 +225,7 @@ def add_to_hash_tree(hash_tree, tag, node_id, node_name, parent_id):
195
225
196
226
# even if the anchor is nested in another anchor, we set the "root" for all descendants to be ::Self
197
227
if node_name == tag :
198
- value = (True , node_id )
228
+ value : Tuple [ bool , Optional [ int ]] = (True , node_id )
199
229
elif (
200
230
is_parent_desc_anchor
201
231
): # reuse the parent's anchor_id (which could be much higher in the tree)
@@ -212,7 +242,7 @@ def add_to_hash_tree(hash_tree, tag, node_id, node_name, parent_id):
212
242
213
243
for index , node_name_index in enumerate (node_names ):
214
244
node_parent = parent [index ]
215
- node_name = strings [node_name_index ].lower ()
245
+ node_name : Optional [ str ] = strings [node_name_index ].lower ()
216
246
217
247
is_ancestor_of_anchor , anchor_id = add_to_hash_tree (
218
248
anchor_ancestry , "a" , index , node_name , node_parent
@@ -253,7 +283,7 @@ def add_to_hash_tree(hash_tree, tag, node_id, node_name, parent_id):
253
283
if not partially_is_in_viewport :
254
284
continue
255
285
256
- meta_data = []
286
+ meta_data : List [ str ] = []
257
287
258
288
# inefficient to grab the same set of keys for kinds of objects, but it's fine for now
259
289
element_attributes = find_attributes (
@@ -274,7 +304,7 @@ def add_to_hash_tree(hash_tree, tag, node_id, node_name, parent_id):
274
304
else child_nodes .setdefault (str (ancestor_node_key ), [])
275
305
)
276
306
277
- if node_name == "#text" and ancestor_exception :
307
+ if node_name == "#text" and ancestor_exception and ancestor_node :
278
308
text = strings [node_value [index ]]
279
309
if text == "|" or text == "•" :
280
310
continue
@@ -289,7 +319,7 @@ def add_to_hash_tree(hash_tree, tag, node_id, node_name, parent_id):
289
319
) # prevent [button ... (button)..]
290
320
291
321
for key in element_attributes :
292
- if ancestor_exception :
322
+ if ancestor_exception and ancestor_node :
293
323
ancestor_node .append (
294
324
{
295
325
"type" : "attribute" ,
@@ -344,36 +374,32 @@ def add_to_hash_tree(hash_tree, tag, node_id, node_name, parent_id):
344
374
for element in elements_in_view_port :
345
375
node_index = element .get ("node_index" )
346
376
node_name = element .get ("node_name" )
347
- node_value = element .get ("node_value" )
348
- is_clickable = element .get ("is_clickable" )
349
- origin_x = element .get ("origin_x" )
350
- origin_y = element .get ("origin_y" )
351
- center_x = element .get ("center_x" )
352
- center_y = element .get ("center_y" )
353
- meta_data = element .get ("node_meta" )
354
-
355
- inner_text = f"{ node_value } " if node_value else ""
377
+ element_node_value = element .get ("node_value" )
378
+ node_is_clickable = element .get ("is_clickable" )
379
+ node_meta_data : Optional [List [str ]] = element .get ("node_meta" )
380
+
381
+ inner_text = f"{ element_node_value } " if element_node_value else ""
356
382
meta = ""
357
383
358
384
if node_index in child_nodes :
359
- for child in child_nodes . get ( node_index ) :
385
+ for child in child_nodes [ node_index ] :
360
386
entry_type = child .get ("type" )
361
387
entry_value = child .get ("value" )
362
388
363
- if entry_type == "attribute" :
389
+ if entry_type == "attribute" and node_meta_data :
364
390
entry_key = child .get ("key" )
365
- meta_data .append (f'{ entry_key } ="{ entry_value } "' )
391
+ node_meta_data .append (f'{ entry_key } ="{ entry_value } "' )
366
392
else :
367
393
inner_text += f"{ entry_value } "
368
394
369
- if meta_data :
370
- meta_string = " " .join (meta_data )
395
+ if node_meta_data :
396
+ meta_string = " " .join (node_meta_data )
371
397
meta = f" { meta_string } "
372
398
373
399
if inner_text != "" :
374
400
inner_text = f"{ inner_text .strip ()} "
375
401
376
- converted_node_name = convert_name (node_name , is_clickable )
402
+ converted_node_name = convert_name (node_name , node_is_clickable )
377
403
378
404
# not very elegant, more like a placeholder
379
405
if (
0 commit comments