@@ -65,6 +65,91 @@ def is_valid_url(url):
65
65
return False
66
66
67
67
68
+ def _get_unique_links (page_url , soup ):
69
+ """
70
+ Returns all unique links.
71
+ Includes:
72
+ "a"->"href", "img"->"src", "link"->"href", and "script"->"src" links.
73
+ """
74
+ prefix = 'http:'
75
+ if page_url .startswith ('https:' ):
76
+ prefix = 'https:'
77
+ simple_url = page_url .split ('://' )[1 ]
78
+ base_url = simple_url .split ('/' )[0 ]
79
+ full_base_url = prefix + "//" + base_url
80
+
81
+ raw_links = []
82
+ raw_unique_links = []
83
+
84
+ # Get "href" from all "a" tags
85
+ links = soup .find_all ('a' )
86
+ for link in links :
87
+ raw_links .append (link .get ('href' ))
88
+
89
+ # Get "src" from all "img" tags
90
+ img_links = soup .find_all ('img' )
91
+ for img_link in img_links :
92
+ raw_links .append (img_link .get ('src' ))
93
+
94
+ # Get "href" from all "link" tags
95
+ links = soup .find_all ('link' )
96
+ for link in links :
97
+ raw_links .append (link .get ('href' ))
98
+
99
+ # Get "src" from all "script" tags
100
+ img_links = soup .find_all ('script' )
101
+ for img_link in img_links :
102
+ raw_links .append (img_link .get ('src' ))
103
+
104
+ for link in raw_links :
105
+ if link not in raw_unique_links :
106
+ raw_unique_links .append (link )
107
+
108
+ unique_links = []
109
+ for link in raw_unique_links :
110
+ if link and len (link ) > 1 :
111
+ if link .startswith ('//' ):
112
+ link = prefix + link
113
+ elif link .startswith ('/' ):
114
+ link = full_base_url + link
115
+ elif link .startswith ('#' ):
116
+ link = full_base_url + link
117
+ else :
118
+ pass
119
+ unique_links .append (link )
120
+
121
+ return unique_links
122
+
123
+
124
+ def _get_link_status_code (link , allow_redirects = False , timeout = 5 ):
125
+ """ Get the status code of a link.
126
+ If the timeout is exceeded, will return a 404.
127
+ For a list of available status codes, see:
128
+ https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
129
+ """
130
+ status_code = None
131
+ try :
132
+ response = requests .get (
133
+ link , allow_redirects = allow_redirects , timeout = timeout )
134
+ status_code = response .status_code
135
+ except Exception :
136
+ status_code = 404
137
+ return status_code
138
+
139
+
140
def _print_unique_links_with_status_codes(page_url, soup):
    """Print every unique link of the page next to its status code.

    The links come from ``_get_unique_links(page_url, soup)``, which
    gathers "a"->"href", "img"->"src", "link"->"href", and
    "script"->"src" values. Each link is then checked with
    ``_get_link_status_code`` and one line is printed per link:
    the link, the text " -> ", and the status code.
    """
    for page_link in _get_unique_links(page_url, soup):
        print(page_link, " -> ", _get_link_status_code(page_link))
151
+
152
+
68
153
def _download_file_to (file_url , destination_folder , new_file_name = None ):
69
154
if new_file_name :
70
155
file_name = new_file_name
0 commit comments