-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathSafewayWebScrapping.py
More file actions
119 lines (92 loc) · 4.22 KB
/
SafewayWebScrapping.py
File metadata and controls
119 lines (92 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# To get this to work, you are going to need to install bs4 and requests
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
# Output CSV: opened once when the module runs and shared, through the
# module-level handle ``f``, with the code that appends product rows.
filename = "products.csv"
# Explicit UTF-8 so that non-ASCII characters in scraped text are written the
# same way on every platform instead of depending on the locale's default
# encoding (which can raise UnicodeEncodeError).
f = open(filename, "w", encoding="utf-8")
# Header row; one data row per product is appended after it.
handlers = "item, calories,fats, carbs, protein\n"
f.write(handlers)
def read_link(url):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The BeautifulSoup object built from the response body with the
        ``html.parser`` backend.
    """
    # The context manager closes the connection even when read() raises;
    # a plain close() after read() would be skipped in that case.
    with uReq(url) as client:
        page_data = client.read()
    return soup(page_data, "html.parser")
def Get_Data_from_item_page(page):
    """Scrape one product page and append its nutrition row to the CSV.

    The page is fetched with ``read_link``. If it contains a table with
    class ``tableOfIngredients``, a row is written to the module-level file
    ``f`` and the scraped values are returned.

    Args:
        page: URL of the product page.

    Returns:
        ``(product_name, calories, fats, carbs, protien)`` holding the raw
        cell text (protein falls back to ``"0g"`` when its cell is empty),
        or ``0`` when the page has no ``tableOfIngredients`` table.
    """
    import csv  # local import keeps this block self-contained

    product_soup = read_link(page)
    product_containers = product_soup.findAll("table", {"class": "tableOfIngredients"})
    product_name_containers = product_soup.find("div", {"class": "product-heading"})
    product_name = product_name_containers.h2.text

    if not product_containers:
        return 0

    product_info = product_containers[0]
    # The first matching cell holds the calories text.
    calories = product_info.find("td", {"class": "table-ingredients-text"}).text
    # Every cell of the nutrition table, in document order.
    raw_data = product_info.findAll("td", {"class": "table-ingredients-text"})

    # NOTE(review): the fixed positions below (4, 24/25, 30) presumably match
    # the table layout of the site when this was written -- confirm before
    # relying on them; a shorter table raises IndexError.
    fats = raw_data[4].text
    # When cell 24 is the "Dietary Fiber" row, the carbs value is taken from
    # the next cell instead.
    if 'Dietary Fiber' in raw_data[24].text:
        carbs = raw_data[25].text
    else:
        carbs = raw_data[24].text

    # Compare the cell's *text* with the empty string: comparing the tag
    # object itself to '' is never true, so the "0g" fallback never applied.
    protien = raw_data[30].text
    if protien.strip() == '':
        protien = "0g"

    # csv.writer quotes fields that contain commas, quotes or newlines, so
    # such text can no longer shift or split the columns of a row. Rows
    # without special characters are written exactly as before.
    csv.writer(f, lineterminator="\n").writerow([
        product_name,
        CalorieFormat(calories),
        FormatInfo(fats),
        FormatInfo(carbs),
        FormatInfo(protien),
    ])
    return product_name, calories, fats, carbs, protien
def FormatInfo(macro):
    """Normalise the text of a nutrition cell for the CSV.

    Removes the literal label ``"Amount Per serving"``, replaces the
    placeholder ``"(-)"`` with ``"0g"`` and lower-cases the result.

    Args:
        macro: Raw cell text.

    Returns:
        The cleaned, lower-cased string.
    """
    # NOTE(review): the label here has a lower-case "serving", unlike the
    # label removed by CalorieFormat -- confirm which spelling the page uses.
    without_label = macro.replace("Amount Per serving", "")
    return without_label.replace('(-)', '0g').lower()
def CalorieFormat(calorie):
    """Clean the calories cell text for the CSV.

    Drops the literal ``"Amount Per Serving\\n"`` label and then removes
    every space character from what is left.

    Args:
        calorie: Raw text of the calories cell.

    Returns:
        The text without the label and without spaces.
    """
    label = "Amount Per Serving\n"
    unlabeled = calorie.replace(label, "")
    # Splitting on a single space and re-joining removes exactly the spaces.
    return "".join(unlabeled.split(' '))
def GetSafewayLink(parital_url_container):
    """Build an absolute Safeway URL from a container's ``href``.

    Args:
        parital_url_container: Object that is subscripted with ``'href'``
            (for example a BeautifulSoup anchor tag) to obtain a
            site-relative link.

    Returns:
        ``"https://safeway.com"`` followed by the ``href`` value, or ``None``
        (after printing a message) when the container has no ``href``.
    """
    try:
        internal_link = parital_url_container['href']
    except KeyError:
        # A missing key/attribute raises KeyError on subscription; catching
        # ValueError here would never trigger and the error would escape.
        print('Invalid container, it does not have a link, does not have attribute href')
        return None
    return "https://safeway.com" + internal_link
def GetItemsFromPage(url):
    """Scrape every product linked from one listing page.

    Fetches ``url``, finds each anchor with class ``product-title`` and
    passes its absolute link to ``Get_Data_from_item_page``, which writes
    the product's row to the CSV.

    Args:
        url: Address of the product listing page.

    Returns:
        None.
    """
    page_soup = read_link(url)
    # NOTE(review): the original author observed 33 items per listing page
    # and intended to keep paging until 33*n covers all items; only the
    # single page fetched above is processed here.
    for contain in page_soup.findAll("a", {"class": "product-title"}):
        new_link = GetSafewayLink(contain)
        if new_link is None:
            # GetSafewayLink yields None when it reports an invalid
            # container; there is nothing to fetch for it.
            continue
        Get_Data_from_item_page(new_link)
def GetSubitemsFromPage(url):
    """Visit the sub-aisles linked from an aisle page and scrape their items.

    Fetches ``url``, finds each anchor with class ``siblingAisle`` and, for
    every one whose label contains no blacklisted word, prints its absolute
    link and hands it to ``GetItemsFromPage``.

    Args:
        url: Address of the aisle page.

    Returns:
        None.
    """
    page_soup = read_link(url)
    containers = page_soup.findAll("a", {"class": "siblingAisle"})
    Product_blacklist = ['Baby', 'Coffee', 'Deli', 'Flowers','Fruits & Vegetables', 'Eggs' ,'Meat & Seafood', 'Beef', 'Laundry', 'Care', 'Pet', 'Wine', 'Beer']
    for contain in containers:
        # Test the sub-aisle being visited. Reading the module-level
        # Product_genre here would re-test the parent category, which has
        # already passed the blacklist, so nothing would ever be skipped.
        # NOTE(review): assumes the sub-aisle name is in ``aria-label`` or,
        # failing that, in the link text -- confirm against the page markup.
        label = contain.get("aria-label") or contain.text
        if any(blocked in label for blocked in Product_blacklist):
            continue
        new_link = GetSafewayLink(contain)
        print(new_link)
        if new_link is not None:
            GetItemsFromPage(new_link)
# Script entry: walk the top-level aisle index, skip blacklisted categories
# and scrape every remaining category through GetSubitemsFromPage.
a_whole_new_link = 'https://www.safeway.com/shop/aisles.3132.html'
try:
    a_whole_new_soup = read_link(a_whole_new_link)
    a_whole_new_container = a_whole_new_soup.findAll("a", {"class": "text-uppercase view-all-subcats"})
    Product_blacklist = ['Baby', 'Coffee', 'Deli', 'Flowers','Fruits & Vegetables', 'Eggs' ,'Meat & Seafood', 'Beef', 'Laundry', 'Care', 'Pet', 'Wine', 'Beer',]
    # Debug output: dump every category anchor that was found.
    for contain in a_whole_new_container:
        print(contain)
    for new_container in a_whole_new_container:
        # Kept as a module-level name because functions in this file may
        # read it as a global.
        Product_genre = new_container['aria-label']
        print(Product_genre)
        test = any(Product_blacklist_item in Product_genre for Product_blacklist_item in Product_blacklist)
        if not test:
            link = GetSafewayLink(new_container)
            print(link)
            if link is not None:
                GetSubitemsFromPage(link)
finally:
    # Close the output CSV so buffered rows are flushed to disk even when
    # the scrape stops early because of an error.
    f.close()