1
- import re # for regular expression matching operations
2
- # import requests # for sending HTTP requests
3
- # from urllib.parse import urlsplit # for breaking URLs down into component parts
4
- # from collections import deque # is a list-like container with fast appends and pops on either end
5
- # from bs4 import BeautifulSoup # for pulling data out of HTML files of websites
1
+ #===================================================================
2
+ #Install Libraries
3
+ #===================================================================
4
+
5
+ import re
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ import pprint
9
+ from requests import get
10
+
11
+
12
+
13
+ # https://www.redhat.com/en/contact #email page to scrape
14
+
15
#===================================================================
# Collect HTML w/ Beautiful Soup
#===================================================================

# Prompt the user for the page whose anchor text will be scanned for
# email-like strings (e.g. https://www.redhat.com/en/contact).
target_webpage = input("Type a target url to scrape the emails from that page: ")

# Fetch the raw HTML. The timeout keeps the script from hanging forever
# on an unresponsive server (requests.get blocks indefinitely without one).
target_html = requests.get(target_webpage, timeout=30).text

# Parse with the lxml backend and collect every <a> tag -- contact pages
# usually put email addresses in link text. find_all("a") is the idiomatic
# single-tag form (the original wrapped the name in a one-element list).
target_html_parsed = BeautifulSoup(target_html, "lxml")
a_tags = target_html_parsed.find_all("a")
35
+
36
+
37
#===================================================================
# Extract Emails with Regex
#===================================================================

# Raw string: the original plain '\S+@\S+' literal relies on a deprecated
# invalid escape sequence. Compiling once hoists the pattern out of the
# loop instead of having re.findall re-look it up per tag.
regex_pattern = re.compile(r"\S+@\S+")

# Print the list of email-like substrings found in each anchor tag's text.
# An empty list is printed for tags with no match, matching the original
# script's output exactly.
for tag in a_tags:
    a_tag_text = tag.get_text()
    email = regex_pattern.findall(a_tag_text)
    print(email)
8
50
9
- # target_webpage = input("Enter a target url to have the emails scraped: ")
10
- # unscraped_url = deque(target_webpage)
11
- # scraped_url = set()
12
- # scraped_email = set()
13
51
14
52
15
53
#===================================================================
16
54
#Regex Examples
17
55
#===================================================================
18
56
19
- email = input ("Type email: " )
57
+ # email = input("Type email: ")
58
+
59
+ # regex = re.compile(r'([A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+')
+ # NOTE(review): corrected from the earlier draft -- inside a character class,
+ # [.-_] is a RANGE from '.' to '_' (accidentally matching digits and more);
+ # put '-' last ([._-]) to make it literal. [A-Z|a-z] also matched a literal
+ # '|'; the alternation bar has no meaning inside [...], so use [A-Za-z].
20
60
21
- regex = re .compile (r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+' )
22
61
62
+ # def isValid(email):
63
+ # if re.fullmatch(regex, email):
64
+ # print("Valid email")
65
+ # else:
66
+ # print("Invalid email")
23
67
24
- def isValid (email ):
25
- if re .fullmatch (regex , email ):
26
- print ("Valid email" )
27
- else :
28
- print ("Invalid email" )
68
+ # isValid(email)
29
69
30
- isValid ( email )
70
+ # (separator=" " )
0 commit comments