22import re
33import requests
44import time
5+ import datetime
56
67RE = '(http[s]?://[-_?&A-Z0-9a-z./=%]+)'
78RE = re .compile (RE )
2627
2728URL_TEMPLATE = 'https://api.yodaat.org/data/{name}_in_es/data/{filename}.csv'
2829HEADERS = {
29- 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:86 .0) Gecko/20100101 Firefox/86 .0' ,
30+ 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:147 .0) Gecko/20100101 Firefox/147 .0' ,
3031}
3132
3233
@@ -47,6 +48,7 @@ def func(row):
4748 error = None
4849 backoff = 10
4950 try :
51+ print (datetime .datetime .now ().isoformat (), 'Checking' , row ['url' ])
5052 while True :
5153 resp = requests .get (row ['url' ], allow_redirects = True , headers = HEADERS , timeout = 10 , stream = True )
5254 if resp .status_code == 429 :
@@ -55,13 +57,14 @@ def func(row):
5557 continue
5658 if resp .status_code >= 300 :
5759 error = '%s: %s' % (resp .status_code , resp .reason )
60+ time .sleep (1 )
5861 break
5962 except requests .exceptions .RequestException as e :
6063 error = str (e .__class__ .__name__ )
6164 except requests .exceptions .BaseHTTPError as e :
6265 error = str (e .__class__ .__name__ )
6366 if error :
64- print (row ['url' ], error )
67+ print (datetime . datetime . now (). isoformat (), 'ERROR' , row ['url' ], error )
6568 row ['error' ] = error
6669 return func
6770
@@ -75,7 +78,7 @@ def func(r):
7578 return wrapper (title_field )
7679
7780def broken_links_flow ():
78- return DF .Flow (
81+ DF .Flow (
7982 * [
8083 DF .Flow (
8184 DF .load (URL_TEMPLATE .format (** c ), name = c ['name' ]),
@@ -84,6 +87,10 @@ def broken_links_flow():
8487 )
8588 for c in configuration
8689 ],
90+ DF .checkpoint ('broken_links' ),
91+ ).process ()
92+ return DF .Flow (
93+ DF .checkpoint ('broken_links' ),
8794 DF .add_field ('urls' , 'array' , lambda r : RE .findall (str (r ))),
8895 DF .add_field ('link' , 'string' , lambda r : 'https://yodaat.org/item/{doc_id}' .format (** r )),
8996 DF .concatenate (dict (
@@ -96,7 +103,7 @@ def broken_links_flow():
96103 DF .add_field ('error' , 'string' ),
97104 unwind (),
98105 DF .delete_fields (['urls' ]),
99- DF .parallelize (check_broken (), 4 ),
106+ DF .parallelize (check_broken (), 16 ),
100107 DF .filter_rows (lambda r : r ['error' ] is not None ),
101108 )
102109
0 commit comments