@@ -65,6 +65,91 @@ def is_valid_url(url):
6565 return False
6666
6767
68+ def _get_unique_links (page_url , soup ):
69+ """
70+ Returns all unique links.
71+ Includes:
72+ "a"->"href", "img"->"src", "link"->"href", and "script"->"src" links.
73+ """
74+ prefix = 'http:'
75+ if page_url .startswith ('https:' ):
76+ prefix = 'https:'
77+ simple_url = page_url .split ('://' )[1 ]
78+ base_url = simple_url .split ('/' )[0 ]
79+ full_base_url = prefix + "//" + base_url
80+
81+ raw_links = []
82+ raw_unique_links = []
83+
84+ # Get "href" from all "a" tags
85+ links = soup .find_all ('a' )
86+ for link in links :
87+ raw_links .append (link .get ('href' ))
88+
89+ # Get "src" from all "img" tags
90+ img_links = soup .find_all ('img' )
91+ for img_link in img_links :
92+ raw_links .append (img_link .get ('src' ))
93+
94+ # Get "href" from all "link" tags
95+ links = soup .find_all ('link' )
96+ for link in links :
97+ raw_links .append (link .get ('href' ))
98+
99+ # Get "src" from all "script" tags
100+ img_links = soup .find_all ('script' )
101+ for img_link in img_links :
102+ raw_links .append (img_link .get ('src' ))
103+
104+ for link in raw_links :
105+ if link not in raw_unique_links :
106+ raw_unique_links .append (link )
107+
108+ unique_links = []
109+ for link in raw_unique_links :
110+ if link and len (link ) > 1 :
111+ if link .startswith ('//' ):
112+ link = prefix + link
113+ elif link .startswith ('/' ):
114+ link = full_base_url + link
115+ elif link .startswith ('#' ):
116+ link = full_base_url + link
117+ else :
118+ pass
119+ unique_links .append (link )
120+
121+ return unique_links
122+
123+
def _get_link_status_code(link, allow_redirects=False, timeout=5):
    """ Return the HTTP status code from a GET request made to a link.
        Any exception raised while making the request (which includes
        exceeding the timeout) is reported as a 404 instead of being
        propagated to the caller.
        link: The URL to request.
        allow_redirects: Passed through to requests.get().
        timeout: Passed through to requests.get().
        For a list of available status codes, see:
        https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
    """
    try:
        return requests.get(
            link, timeout=timeout, allow_redirects=allow_redirects
        ).status_code
    except Exception:
        # Deliberate best-effort: a link that cannot be fetched for any
        # reason is treated the same as a missing page.
        return 404
138+
139+
def _print_unique_links_with_status_codes(page_url, soup):
    """ Prints every unique link found in the html of the page source,
        together with the status code obtained for that link.
        One link is printed per line, followed by " -> " and its code.
        The links are the ones collected by _get_unique_links(), from:
        "a"->"href", "img"->"src", "link"->"href", and "script"->"src".
    """
    for unique_link in _get_unique_links(page_url, soup):
        print(unique_link, " -> ", _get_link_status_code(unique_link))
151+
152+
68153def _download_file_to (file_url , destination_folder , new_file_name = None ):
69154 if new_file_name :
70155 file_name = new_file_name
0 commit comments