Skip to content

Commit 12cc37a

Browse files
authored
Merge pull request #141 from PdxCodeGuild/daniel-mini_capstone
Daniel mini capstone
2 parents a35d918 + 4ffc4a8 commit 12cc37a

File tree

1 file changed

+70
-0
lines changed

1 file changed

+70
-0
lines changed
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#===================================================================
2+
#Import Libraries
3+
#===================================================================
4+
5+
import re
6+
import requests
7+
from bs4 import BeautifulSoup
8+
import pprint
9+
from requests import get
10+
11+
12+
13+
# Example page to scrape for emails: https://www.redhat.com/en/contact
14+
15+
#===================================================================
16+
#Collect HTML w/ Beautiful Soup
17+
#===================================================================
18+
# Ask the user which page to scrape. Strip surrounding whitespace so a
# copy-pasted URL does not carry stray spaces or newlines into the request.
target_webpage = input("Type a target url to scrape the emails from that page: ").strip()

# Always pass a timeout: without one, requests waits indefinitely on an
# unresponsive server and the script appears to hang.
target_html = requests.get(target_webpage, timeout=10).text

# Parse the markup and collect every anchor tag; the extraction step
# further down scans these tags for email addresses.
target_html_parsed = BeautifulSoup(target_html, "lxml")
a_tags = target_html_parsed.find_all("a")
26+
# print(target_html_parsed)
27+
# print(a_tags)
28+
29+
# print(type(a_tags))
30+
31+
32+
# pprint.pprint((target_html_parsed.prettify()))
33+
# print(target_html_parsed.title.text)
34+
# print(target_html_parsed.get_text())
35+
36+
37+
#===================================================================
38+
#Extract Emails w/ Regex
39+
#===================================================================
40+
41+
# Loose email matcher: any run of non-whitespace characters around an "@".
# Written as a raw string so "\S" reaches the regex engine untouched; in a
# plain literal it is an invalid escape sequence that newer Pythons warn about.
regex_pattern = r'\S+@\S+'

# Scan the visible text of each anchor tag and report the addresses found.
for tag in a_tags:
    a_tag_text = tag.get_text()
    email = re.findall(regex_pattern, a_tag_text)
    # Most links contain no address; skip them instead of printing "[]"
    # once per anchor and burying the real results.
    if email:
        print(email)
50+
51+
52+
53+
#===================================================================
54+
#Regex Examples
55+
#===================================================================
56+
57+
# email = input("Type email: ")
58+
59+
# regex = re.compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
60+
61+
62+
# def isValid(email):
63+
# if re.fullmatch(regex, email):
64+
# print("Valid email")
65+
# else:
66+
# print("Invalid email")
67+
68+
# isValid(email)
69+
70+
# (separator=" ")

0 commit comments

Comments
 (0)