Commit 2b88cb2

added scripts
1 parent ac7461c commit 2b88cb2

7 files changed: +386 −15 lines


FlipkartScraper/dbConnector.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
import sqlite3
import os


class FlipkartDatabaseConnector:
    def __init__(self, stamp):
        self.dbPath = "flipkart.db"
        self.conn = sqlite3.connect(self.dbPath)
        self.cur = self.conn.cursor()
        self.welcomeMessage = "Welcome to Flipkart Scraper. This is the database for the Flipkart Scraper. This database was created on {}.".format(stamp)

    def schemaMaker(self):
        # creating tables
        self.cur.execute("""CREATE TABLE products (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            sku TEXT NOT NULL,
            name TEXT NOT NULL,
            description TEXT NOT NULL,
            image_path TEXT NOT NULL,
            category TEXT NOT NULL,
            timestamp TEXT NOT NULL,
            URL TEXT NOT NULL,
            price TEXT NOT NULL
        );""")
        self.conn.commit()
        self.cur.execute("CREATE TABLE product_matches (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, product_sku INTEGER NOT NULL, match_id INTEGER NOT NULL, match_sku INTEGER NOT NULL);")
        self.conn.commit()

    def insertProduct(self, productDetails):
        self.cur.execute("INSERT INTO products (sku, name, description, image_path, category, timestamp, URL, price) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", (productDetails["sku"], productDetails["name"], productDetails["description"], productDetails["image_path"], productDetails["category"], productDetails["timestamp"], productDetails["URL"], productDetails["price"]))
        self.conn.commit()

    def fetchAllProducts(self):
        self.cur.execute("SELECT * FROM products")
        return self.cur.fetchall()

    def clearDatabase(self):
        self.cur.execute("DELETE FROM products")
        self.conn.commit()
        self.cur.execute("DELETE FROM product_matches")
        self.conn.commit()

    def removeDuplicates(self):
        self.cur.execute("DELETE FROM products WHERE rowid NOT IN (SELECT MIN(rowid) FROM products GROUP BY sku)")
        self.conn.commit()
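
A minimal usage sketch for the connector (illustration only, not part of the commit): it assumes dbConnector.py is importable from the working directory, the dictionary keys mirror what insertProduct expects, and all values are placeholders. Note that schemaMaker creates the tables without IF NOT EXISTS, so it should only run on a fresh database.

    from datetime import datetime
    from dbConnector import FlipkartDatabaseConnector

    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    db = FlipkartDatabaseConnector(stamp)
    db.schemaMaker()  # first run only; fails if the tables already exist
    db.insertProduct({
        "sku": "MOB-EXAMPLE-1",        # hypothetical values for illustration
        "name": "Example Phone",
        "description": "Sample row",
        "image_path": "https://example.com/img.jpg",
        "category": "mobiles",
        "timestamp": stamp,
        "URL": "https://www.flipkart.com/example",
        "price": "9999",
    })
    print(db.fetchAllProducts())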

FlipkartScraper/flipkart.db

0 Bytes
Binary file not shown.

FlipkartScraper/genricHtmlib.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
from multiprocessing import Pool
import os
from datetime import datetime
import lxml.html as html
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import warnings
import requests
warnings.filterwarnings("ignore")

class SeleniumScraper:
    def __init__(self, timeout=10):
        self.timeout = timeout
        self.reqSession = requests.Session()
        self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.storagePath = os.path.join(
            os.path.dirname(os.path.abspath(__file__))
        )

        self.headers = {
            'authority': 'www.amazon.com',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        }

    def fetch_request_normal(self, url, params=None):
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
            }
            response = self.reqSession.get(url, headers=headers)

            if response.status_code == 200:
                return response.text

            if response.status_code == 301:
                # retry with redirect
                response = requests.get(response.headers['Location'])
                response.raise_for_status()
                if response.status_code == 200:
                    return response.text

            if response.status_code == 503:
                # blocked / service unavailable; caller falls back to Selenium
                return None

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
        return None

    def get_xpath_link(self, doc, xpath, website):
        try:
            name = doc.xpath("".join(xpath))
            # turn relative hrefs into absolute links on the target site
            for i in range(len(name)):
                if name[i].startswith("/"):
                    name[i] = website + name[i]
            return name

        except Exception as e:
            print("Error in getting xpath link: {}".format(e))
            return None

    def get_selenium_driver(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-logging")
        chrome_options.add_argument("--log-level=3")
        chrome_options.add_argument("--silent")
        chrome_options.add_argument("--blink-settings=imagesEnabled=false")
        driver = webdriver.Chrome(chrome_options=chrome_options)
        return driver

    def fetch_request_selenium(self, url, waiting_time=1):
        try:
            driver = self.get_selenium_driver()
            driver.get(url)
            time.sleep(waiting_time)
            doc = html.fromstring(driver.page_source)
            driver.close()
            return doc

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
            return None

    def get_xpath_data(self, doc, xpath):
        try:
            name = doc.xpath(xpath)
            return name

        except Exception as e:
            print("Error in getting xpath data: {}".format(e))
            return None

    def slow_page_scroll(self, driver, speed):
        current_scroll_position = driver.execute_script("return window.pageYOffset;")
        while current_scroll_position < driver.execute_script(
            "return document.body.scrollHeight;"
        ):
            driver.execute_script(
                "window.scrollTo(0, arguments[0]);", current_scroll_position
            )
            current_scroll_position += 1000
            time.sleep(speed)

    def data_storage(self, df_list, unique_id, name, storageFormat, storagePath=None):
        df_combined = pd.concat(df_list, ignore_index=True)
        df_combined.drop_duplicates(subset=unique_id, inplace=True)
        if storageFormat == "csv":
            df_combined.to_csv(
                self.storagePath + "/{}_{}.csv".format(name, self.stamp),
                index=False,
            )
        elif storageFormat == "json":
            df_combined.to_json(
                self.storagePath + "/{}_{}.json".format(name, self.stamp),
                orient="records",
            )

    def cleanData(self, array):
        array = [x.strip() for x in array]
        array = list(filter(None, array))
        array = [x.encode("ascii", "ignore").decode() for x in array]
        array = [x.replace("\n", "") for x in array]
        return array
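
A brief sketch of the fetch-then-parse flow this helper is built around (illustration only, not part of the commit). The search URL and XPath below are placeholders, and the Selenium fallback assumes a local Chrome/chromedriver install.

    import lxml.html as html
    from genricHtmlib import SeleniumScraper

    scraper = SeleniumScraper()
    url = "https://www.flipkart.com/search?q=mobiles"  # placeholder query URL
    page = scraper.fetch_request_normal(url)
    if page is None:
        # plain request was blocked (e.g. 503); fall back to headless Chrome
        doc = scraper.fetch_request_selenium(url)
    else:
        doc = html.fromstring(page)
    titles = scraper.get_xpath_data(doc, "//title/text()")  # placeholder XPath
    print(scraper.cleanData(titles))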

FlipkartScraper/main.py

Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@
import logging
from datetime import datetime
from dbConnector import FlipkartDatabaseConnector
from productList import product_categories
from genricHtmlib import SeleniumScraper
import os
import lxml.html as html
import concurrent.futures

SeleniumScraper = SeleniumScraper()

class Scraper:
    def __init__(self):
        self.brand: str = "flipkart"
        self.website = "https://www.flipkart.com/search?q="
        self.websiteName = "https://www.flipkart.com"
        self.stamp: str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.storagePath: str = os.getcwd()

        self.productLinksXpath = '//*[@rel="noopener noreferrer"]//@href'
        self.skuXpath = '//tr[contains(@class, "row")]//td[contains(text(), "Model Number")]/following-sibling::td[1]/ul/li/text()'
        self.nameXpath = '//*[@class="B_NuCI"]//text()'
        self.description = '//div[contains(text(), "Description")]/following-sibling::div[1]/div/text()'
        self.image = '//*[@class="_396cs4 _2amPTt _3qGmMb"]//@src'
        self.category = '//*[@class="_3GIHBu"]//text()'
        self.price = '//*[@class="_30jeq3 _16Jk6d"]//text()'

    def getProductList(self, keyword):
        try:
            productLinks = []
            url = self.website + keyword
            response = SeleniumScraper.fetch_request_normal(url)
            if response is None:
                doc = SeleniumScraper.fetch_request_selenium(url)
            else:
                doc = html.fromstring(response)

            Links = SeleniumScraper.get_xpath_link(doc, self.productLinksXpath, self.websiteName)
            productLinks.extend(Links)

            for page in range(2, 20):
                print(f'Getting page {page} for {keyword}')
                url = self.website + keyword + "&page=" + str(page)
                response = SeleniumScraper.fetch_request_normal(url)
                if response is None:
                    doc = SeleniumScraper.fetch_request_selenium(url)
                else:
                    doc = html.fromstring(response)

                Links = SeleniumScraper.get_xpath_link(doc, self.productLinksXpath, self.websiteName)
                productLinks.extend(Links)

            print(f'Total products for {keyword} is {len(productLinks)}')
            return productLinks

        except Exception as e:
            print(e)

    def getProductDetails(self, productLink):
        print(f'Getting product details for {productLink}')
        response = SeleniumScraper.fetch_request_normal(productLink)
        if response is None:
            doc = SeleniumScraper.fetch_request_selenium(productLink)
        else:
            doc = html.fromstring(response)

        productDetails = {}

        try:
            sku = SeleniumScraper.get_xpath_data(doc, self.skuXpath)
            sku = sku[0]
        except Exception:
            sku = "None"

        try:
            name = SeleniumScraper.get_xpath_data(doc, self.nameXpath)
            name = name[0]
        except Exception:
            name = "None"

        try:
            description = SeleniumScraper.get_xpath_data(doc, self.description)
            description = ''.join(description)
        except Exception:
            description = "None"

        try:
            image_path = SeleniumScraper.get_xpath_link(doc, self.image, self.websiteName)
            image_path = image_path[0]
        except Exception:
            image_path = "None"

        try:
            category = SeleniumScraper.get_xpath_data(doc, self.category)
            category = category[1]
        except Exception:
            category = "None"

        try:
            price = SeleniumScraper.get_xpath_data(doc, self.price)
            price = SeleniumScraper.cleanData(price)
            price = price[0]
        except Exception:
            price = "None"

        productDetails["sku"] = str(sku)
        productDetails["name"] = str(name)
        productDetails["description"] = str(description)
        productDetails["image_path"] = str(image_path)
        productDetails["category"] = str(category)
        productDetails["timestamp"] = str(self.stamp)
        productDetails["URL"] = str(productLink)
        productDetails['price'] = price

        print(productDetails)
        return productDetails

    def start(self):
        productList = []
        number_of_threads: int = 1

        # Log start of scraper
        print(f"Starting {self.brand} scraper")

        # create flipkart.db if it doesn't exist
        if not os.path.exists(self.storagePath + "/" + self.brand + ".db"):
            print(f'Creating {self.brand}.db at {self.storagePath + "/" + self.brand + ".db"}')
            db = FlipkartDatabaseConnector(self.stamp)
            db.schemaMaker()
            print(db.welcomeMessage)

        self.db = FlipkartDatabaseConnector(self.stamp)
        print(self.db.welcomeMessage)

        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
            productUrls = executor.map(self.getProductList, product_categories)
            productList.extend(productUrls)

        # flatten the list productList
        productList = [item for sublist in productList for item in sublist]
        print(f'Total products for {self.brand} is {len(productList)}')

        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
            results = executor.map(self.getProductDetails, productList)

            for result in results:
                print(f"Saving {result['sku']} to db")
                self.db.insertProduct(result)

        self.db.removeDuplicates()


if __name__ == '__main__':
    scraper = Scraper()
    scraper.start()
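
After a run, the collected rows can be inspected directly with sqlite3. A small sketch, assuming main.py was executed from this directory and produced flipkart.db with the products table defined in dbConnector.py:

    import sqlite3

    # assumes flipkart.db exists in the current directory after a scrape
    conn = sqlite3.connect("flipkart.db")
    cur = conn.cursor()
    cur.execute("SELECT sku, name, price FROM products LIMIT 5")
    for row in cur.fetchall():
        print(row)
    conn.close()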

FlipkartScraper/productList.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
product_categories = [
    'mobiles',
]

FlipkartScraper/requirements.txt

Lines changed: 0 additions & 15 deletions
@@ -1,25 +1,10 @@
-jupyter
-scikit-learn
-pandas
-numpy
-matplotlib
-seaborn
-tensorflow
-flask
-openai
 bs4
 requests
 pandas
-requests
 numpy
 bs4
-geopy
 boto3
 ndjson
 selenium
 httpx
 lxml
-python-dotenv
-paramiko
-undetected-chromedriver
-fastjsonschema
