Skip to content

Commit 420e845

Browse files
committed
Add methods for getting status codes from links
1 parent 4553a90 commit 420e845

File tree

2 files changed

+122
-0
lines changed

2 files changed

+122
-0
lines changed

seleniumbase/fixtures/base_case.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1622,6 +1622,43 @@ def get_beautiful_soup(self, source=None):
16221622
soup = BeautifulSoup(source, "html.parser")
16231623
return soup
16241624

1625+
def get_unique_links(self):
    """ Get all unique links in the html of the page source.
        Page links include those obtained from:
        "a"->"href", "img"->"src", "link"->"href", and "script"->"src". """
    current_url = self.get_current_url()
    html_soup = self.get_beautiful_soup(self.get_page_source())
    return page_utils._get_unique_links(current_url, html_soup)
1633+
1634+
def get_link_status_code(self, link, allow_redirects=False, timeout=5):
    """ Get the status code of a link.
        If the timeout is exceeded, will return a 404.
        For a list of available status codes, see:
        https://en.wikipedia.org/wiki/List_of_HTTP_status_codes """
    return page_utils._get_link_status_code(
        link, allow_redirects=allow_redirects, timeout=timeout)
1642+
1643+
def assert_no_404_errors(self):
    """ Assert no 404 errors from page links obtained from:
        "a"->"href", "img"->"src", "link"->"href", and "script"->"src".
        Raises the usual assertion failure (via assert_not_equal) naming
        the first link that returned a 404. """
    links = self.get_unique_links()
    for link in links:
        # Compare the numeric status code directly instead of
        # round-tripping it through a string conversion.
        status_code = self.get_link_status_code(link)
        bad_link_str = 'Error: "%s" returned a 404!' % link
        self.assert_not_equal(status_code, 404, bad_link_str)
1651+
1652+
def print_unique_links_with_status_codes(self):
    """ Finds all unique links in the html of the page source
        and then prints out those links with their status codes.
        Format: ["link" -> "status_code"] (per line)
        Page links include those obtained from:
        "a"->"href", "img"->"src", "link"->"href", and "script"->"src". """
    url_of_page = self.get_current_url()
    parsed_soup = self.get_beautiful_soup(self.get_page_source())
    page_utils._print_unique_links_with_status_codes(url_of_page, parsed_soup)
1661+
16251662
def safe_execute_script(self, script):
16261663
""" When executing a script that contains a jQuery command,
16271664
it's important that the jQuery library has been loaded first.

seleniumbase/fixtures/page_utils.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,91 @@ def is_valid_url(url):
6565
return False
6666

6767

68+
def _get_unique_links(page_url, soup):
69+
"""
70+
Returns all unique links.
71+
Includes:
72+
"a"->"href", "img"->"src", "link"->"href", and "script"->"src" links.
73+
"""
74+
prefix = 'http:'
75+
if page_url.startswith('https:'):
76+
prefix = 'https:'
77+
simple_url = page_url.split('://')[1]
78+
base_url = simple_url.split('/')[0]
79+
full_base_url = prefix + "//" + base_url
80+
81+
raw_links = []
82+
raw_unique_links = []
83+
84+
# Get "href" from all "a" tags
85+
links = soup.find_all('a')
86+
for link in links:
87+
raw_links.append(link.get('href'))
88+
89+
# Get "src" from all "img" tags
90+
img_links = soup.find_all('img')
91+
for img_link in img_links:
92+
raw_links.append(img_link.get('src'))
93+
94+
# Get "href" from all "link" tags
95+
links = soup.find_all('link')
96+
for link in links:
97+
raw_links.append(link.get('href'))
98+
99+
# Get "src" from all "script" tags
100+
img_links = soup.find_all('script')
101+
for img_link in img_links:
102+
raw_links.append(img_link.get('src'))
103+
104+
for link in raw_links:
105+
if link not in raw_unique_links:
106+
raw_unique_links.append(link)
107+
108+
unique_links = []
109+
for link in raw_unique_links:
110+
if link and len(link) > 1:
111+
if link.startswith('//'):
112+
link = prefix + link
113+
elif link.startswith('/'):
114+
link = full_base_url + link
115+
elif link.startswith('#'):
116+
link = full_base_url + link
117+
else:
118+
pass
119+
unique_links.append(link)
120+
121+
return unique_links
122+
123+
124+
def _get_link_status_code(link, allow_redirects=False, timeout=5):
    """ Get the status code of a link.
        If the timeout is exceeded, will return a 404.
        For a list of available status codes, see:
        https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
    """
    try:
        response = requests.get(
            link, allow_redirects=allow_redirects, timeout=timeout)
    except Exception:
        # Any failure (timeout, connection error, bad URL) reads as a 404.
        return 404
    return response.status_code
138+
139+
140+
def _print_unique_links_with_status_codes(page_url, soup):
    """ Finds all unique links in the html of the page source
        and then prints out those links with their status codes.
        Format: ["link" -> "status_code"] (per line)
        Page links include those obtained from:
        "a"->"href", "img"->"src", "link"->"href", and "script"->"src".
    """
    for unique_link in _get_unique_links(page_url, soup):
        code = _get_link_status_code(unique_link)
        print(unique_link, " -> ", code)
151+
152+
68153
def _download_file_to(file_url, destination_folder, new_file_name=None):
69154
if new_file_name:
70155
file_name = new_file_name

0 commit comments

Comments
 (0)