Skip to content

Commit 6833ccb

Browse files
committed
scrape works
1 parent 4c9ab3b commit 6833ccb

File tree

1 file changed

+57
-17
lines changed

1 file changed

+57
-17
lines changed
Lines changed: 57 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,70 @@
1-
import re # for regular expression matching operations
2-
# import requests # for sending HTTP requests
3-
# from urllib.parse import urlsplit # for breaking URLs down into component parts
4-
# from collections import deque # is a list-like container with fast appends and pops on either end
5-
# from bs4 import BeautifulSoup # for pulling data out of HTML files of websites
1+
#===================================================================
2+
#Import Libraries
3+
#===================================================================
4+
5+
import re
6+
import requests
7+
from bs4 import BeautifulSoup
8+
import pprint
9+
from requests import get
10+
11+
12+
13+
# Example page to scrape for emails: https://www.redhat.com/en/contact
14+
15+
#===================================================================
16+
#Collect HTML w/ Beautiful Soup
17+
#===================================================================
18+
target_webpage = input("Type a target url to scrape the emails from that page: ")
19+
# # print(target_webpage)
20+
21+
target_html = requests.get(target_webpage, timeout=10).text  # timeout: requests waits indefinitely by default
22+
# print(target_html)
623

24+
target_html_parsed = BeautifulSoup(target_html, "lxml")
25+
a_tags = target_html_parsed.find_all(['a'])
26+
# print(target_html_parsed)
27+
# print(a_tags)
728

29+
# print(type(a_tags))
30+
31+
32+
# pprint.pprint((target_html_parsed.prettify()))
33+
# print(target_html_parsed.title.text)
34+
# print(target_html_parsed.get_text())
35+
36+
37+
#===================================================================
38+
#Extract Emails w/ Regex
39+
#===================================================================
40+
41+
regex_pattern = r'\S+@\S+'  # raw string: '\S' is an invalid str escape (SyntaxWarning on Python 3.12+)
42+
43+
44+
for tag in a_tags:
45+
a_tag_text = tag.get_text()
46+
# print(type(a_tag_text))
47+
# print("a_tag_text: ", a_tag_text)
48+
email = re.findall(regex_pattern, a_tag_text)
49+
print(email)
850

9-
# target_webpage = input("Enter a target url to have the emails scraped: ")
10-
# unscraped_url = deque(target_webpage)
11-
# scraped_url = set()
12-
# scraped_email = set()
1351

1452

1553
#===================================================================
1654
#Regex Examples
1755
#===================================================================
1856

19-
email = input("Type email: ")
57+
# email = input("Type email: ")
58+
59+
# regex = re.compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
2060

21-
regex = re.compile(r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+')
2261

62+
# def isValid(email):
63+
# if re.fullmatch(regex, email):
64+
# print("Valid email")
65+
# else:
66+
# print("Invalid email")
2367

24-
def isValid(email):
25-
if re.fullmatch(regex, email):
26-
print("Valid email")
27-
else:
28-
print("Invalid email")
68+
# isValid(email)
2969

30-
isValid(email)
70+
# (separator=" ")

0 commit comments

Comments
 (0)