-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcheck_links_in_ipynb.py
More file actions
76 lines (59 loc) · 1.98 KB
/
check_links_in_ipynb.py
File metadata and controls
76 lines (59 loc) · 1.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import re
import urllib.parse as up
import urllib.request as ur
import nbformat
import requests
def is_cell_markdown(cell):
    """
    Tell whether an ipynb cell is a Markdown cell.

    cell : ipynb cell; its 'cell_type' value is read
    Returns True when the cell type, ignoring surrounding whitespace and
    letter case, starts with 'markdown'; otherwise False.
    """
    declared_type = cell['cell_type']
    normalized_type = declared_type.strip().lower()
    return normalized_type.startswith('markdown')
def get_re_markdown_simple_link():
    """
    Build the compiled pattern for plain Markdown links.

    Matches text of the form [text](url); group 1 of every match is the
    url part between the parentheses.
    """
    simple_link_pattern = r'\[.+?\]\((.+?)\)'
    return re.compile(simple_link_pattern)
def get_re_markdown_image_link():
    """
    Build the compiled pattern for Markdown links wrapped around an image.

    Matches text of the form [![alt](image_url)](url); group 1 of every
    match is the outer url, not the image_url.
    """
    image_link_pattern = r'\[\!\[.+?\]\(.+?\)\]\((.+?)\)'
    return re.compile(image_link_pattern)
# Module-level compiled patterns, built once and reused by the checks below
# instead of being compiled again for every cell.
rs = get_re_markdown_simple_link()
ri = get_re_markdown_image_link()
def check_link_in_cell(cell, r):
    """
    Request every URL that regex r finds in a cell; stop at the first failure.

    cell : ipynb cell. Its 'source' may be one string or a list of line
           strings (the multi-line form used in raw notebook JSON)
    r : regex. return value from get_re_markdown_simple_link() or get_re_markdown_image_link()
        group 1 of each match is taken as the URL

    Returns None when no URL matched or every response passed
    raise_for_status().
    Raises whatever raise_for_status() raises for an error status; any
    exception raised by requests.get() itself propagates unchanged.
    """
    source = cell['source']
    if not isinstance(source, str):
        # a list of lines cannot be scanned by finditer(); join it first
        source = ''.join(source)

    # url match loop
    for m in r.finditer(source):
        # percent-escapes in the matched text are decoded before the request
        url = up.unquote(m.group(1))
        response = requests.get(url, timeout=60)
        # https://2.python-requests.org/en/master/user/quickstart/#response-status-codes
        response.raise_for_status()
def check_links_in_ipynb(filename):
    """
    Check the links in the Markdown cells of one notebook file.

    filename : path to an ipynb file
    """
    # read the notebook as UTF-8 text, passing nbformat.NO_CONVERT as the version
    with open(filename, encoding='utf-8') as notebook_file:
        notebook = nbformat.read(notebook_file, nbformat.NO_CONVERT)

    # the notebook is fully loaded here, so the file is already closed
    cells = notebook['cells']
    check_links_in_ipynb_cells_list(cells)
def check_links_in_ipynb_cells_list(cells_list):
    """
    Check the links found in each Markdown cell of a notebook.

    cells_list : ipynb notebook's cells

    Example (filename is a path to an ipynb file):
        with open(filename, encoding='utf-8') as ipynb:
            nb = nbformat.read(ipynb, nbformat.NO_CONVERT)
        check_links_in_ipynb_cells_list(nb['cells'])
    """
    # https://stackoverflow.com/questions/16778435/python-check-if-website-exists
    # only Markdown cells are inspected; every other cell type is skipped
    markdown_cells = (cell for cell in cells_list if is_cell_markdown(cell))
    for markdown_cell in markdown_cells:
        # simple [text](url) links first, then urls linked to images
        for link_regex in (rs, ri):
            check_link_in_cell(markdown_cell, link_regex)