Skip to content

Commit f5a6c45

Browse files
committed
broken links
1 parent e1cb0ff commit f5a6c45

File tree

1 file changed

+11
-4
lines changed

1 file changed

+11
-4
lines changed

datapackage_pipelines_migdar/flows/broken_links.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import re
33
import requests
44
import time
5+
import datetime
56

67
RE = '(http[s]?://[-_?&A-Z0-9a-z./=%]+)'
78
RE = re.compile(RE)
@@ -26,7 +27,7 @@
2627

2728
URL_TEMPLATE='https://api.yodaat.org/data/{name}_in_es/data/{filename}.csv'
2829
HEADERS = {
29-
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:86.0) Gecko/20100101 Firefox/86.0',
30+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:147.0) Gecko/20100101 Firefox/147.0',
3031
}
3132

3233

@@ -47,6 +48,7 @@ def func(row):
4748
error = None
4849
backoff = 10
4950
try:
51+
print(datetime.datetime.now().isoformat(), 'Checking', row['url'])
5052
while True:
5153
resp = requests.get(row['url'], allow_redirects=True, headers=HEADERS, timeout=10, stream=True)
5254
if resp.status_code == 429:
@@ -55,13 +57,14 @@ def func(row):
5557
continue
5658
if resp.status_code >= 300:
5759
error = '%s: %s' % (resp.status_code, resp.reason)
60+
time.sleep(1)
5861
break
5962
except requests.exceptions.RequestException as e:
6063
error = str(e.__class__.__name__)
6164
except requests.exceptions.BaseHTTPError as e:
6265
error = str(e.__class__.__name__)
6366
if error:
64-
print(row['url'], error)
67+
print(datetime.datetime.now().isoformat(), 'ERROR', row['url'], error)
6568
row['error'] = error
6669
return func
6770

@@ -75,7 +78,7 @@ def func(r):
7578
return wrapper(title_field)
7679

7780
def broken_links_flow():
78-
return DF.Flow(
81+
DF.Flow(
7982
*[
8083
DF.Flow(
8184
DF.load(URL_TEMPLATE.format(**c), name=c['name']),
@@ -84,6 +87,10 @@ def broken_links_flow():
8487
)
8588
for c in configuration
8689
],
90+
DF.checkpoint('broken_links'),
91+
).process()
92+
return DF.Flow(
93+
DF.checkpoint('broken_links'),
8794
DF.add_field('urls', 'array', lambda r: RE.findall(str(r))),
8895
DF.add_field('link', 'string', lambda r: 'https://yodaat.org/item/{doc_id}'.format(**r)),
8996
DF.concatenate(dict(
@@ -96,7 +103,7 @@ def broken_links_flow():
96103
DF.add_field('error', 'string'),
97104
unwind(),
98105
DF.delete_fields(['urls']),
99-
DF.parallelize(check_broken(), 4),
106+
DF.parallelize(check_broken(), 16),
100107
DF.filter_rows(lambda r: r['error'] is not None),
101108
)
102109

0 commit comments

Comments
 (0)