
Commit 2a0cf5a

Merge pull request #1478 from paritoshtripathi935/flipkartScraper
added flipkart scraper for gssoc'23 contribution
2 parents 18eb082 + 2b88cb2 commit 2a0cf5a

File tree

8 files changed: +439 -0 lines changed


FlipkartScraper/README.md

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
# Flipkart Scraper

This is a simple scraper designed to extract product information from Flipkart, an e-commerce platform. The scraper is written in Python and consists of the following files:

1. dbConnector.py: This file contains the code for connecting to a database and performing database operations related to storing the scraped data.

2. genericHtmlib.py: This file provides a set of generic functions and utilities for parsing HTML and extracting data from web pages.

3. main.py: This is the main entry point of the scraper. It initializes the necessary components and orchestrates the scraping process.

4. productList.py: This file contains the categories that you want to scrape (a minimal sketch of its expected shape follows this list).

5. `__pycache__`: This directory contains the compiled bytecode of the Python files for faster execution. You can safely ignore this directory.

6. useragent.py: This file defines the User-Agent string that the scraper uses for making HTTP requests. It helps mimic the behavior of a real web browser.
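productList.py itself is not included in this diff. A minimal sketch of what it might look like, assuming it simply exposes the Flipkart category search URLs that main.py iterates over (the URLs below are placeholders):

```
# productList.py -- hypothetical sketch; the real file is not shown in this commit
productList = [
    "https://www.flipkart.com/search?q=mobiles",
    "https://www.flipkart.com/search?q=laptops",
    "https://www.flipkart.com/search?q=headphones",
]
```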
## To use the Flipkart scraper, follow these steps:

Make sure you have Python installed on your system.

- Create a virtual environment by running the following command:
```
python3 -m venv venv
```
- Activate the virtual environment (on Linux/macOS: `source venv/bin/activate`; on Windows: `venv\Scripts\activate`).

Install the required dependencies by running the following command:
```
pip install -r requirements.txt
```
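The contents of requirements.txt are not shown in this excerpt; judging from the imports used in genricHtmlib.py further down, it presumably lists at least something like:

```
requests
lxml
pandas
selenium
```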
- Open productList.py and add the categories that you want to scrape.

Execute the scraper by running the following command:

```
python main.py
```

The scraper will start processing the product URLs one by one, extracting relevant information such as the product name, price, description, and any other details specified in the code. The scraped data will be stored in the configured database or output format.
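For example, once main.py has finished, the results can be checked directly against the SQLite database (this assumes the flipkart.db file and the products table created by dbConnector.py shown below):

```
# inspect_results.py -- minimal sketch for checking scraped rows, assuming
# flipkart.db and the products table from dbConnector.py exist.
import sqlite3

conn = sqlite3.connect("flipkart.db")
cur = conn.cursor()

# Count scraped products and print a few of the most recent rows.
cur.execute("SELECT COUNT(*) FROM products")
print("rows scraped:", cur.fetchone()[0])

for name, price, url in cur.execute(
    "SELECT name, price, URL FROM products ORDER BY id DESC LIMIT 5"
):
    print(name, price, url)

conn.close()
```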
Please note that web scraping should be done responsibly and in compliance with the terms and conditions of the target website. Make sure to respect the website's policies regarding scraping frequency and data usage.

If you encounter any issues or have any questions, feel free to open an issue or reach out to the project maintainer.

Built with ❤️ by [Paritosh Tripathi](https://github.com/paritoshtripathi935)

FlipkartScraper/dbConnector.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
import sqlite3
import os


class FlipkartDatabaseConnector:
    def __init__(self, stamp):
        self.dbPath = "flipkart.db"
        self.conn = sqlite3.connect(self.dbPath)
        self.cur = self.conn.cursor()
        self.welcomeMessage = "Welcome to Flipkart Scraper. This is the database for the Flipkart Scraper. This database was created on {}.".format(stamp)

    def schemaMaker(self):
        # Create the products table that insertProduct() writes to.
        self.cur.execute("""CREATE TABLE products (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            sku TEXT NOT NULL,
            name TEXT NOT NULL,
            description TEXT NOT NULL,
            image_path TEXT NOT NULL,
            category TEXT NOT NULL,
            timestamp TEXT NOT NULL,
            URL TEXT NOT NULL,
            price TEXT NOT NULL
        );""")
        self.conn.commit()
        # Create the product_matches table used for linking matched products.
        self.cur.execute("CREATE TABLE product_matches (id INTEGER PRIMARY KEY AUTOINCREMENT, product_id INTEGER NOT NULL, product_sku INTEGER NOT NULL, match_id INTEGER NOT NULL, match_sku INTEGER NOT NULL);")
        self.conn.commit()

    def insertProduct(self, productDetails):
        # Insert a single scraped product; productDetails is a dict keyed by column name.
        self.cur.execute(
            "INSERT INTO products (sku, name, description, image_path, category, timestamp, URL, price) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (
                productDetails["sku"],
                productDetails["name"],
                productDetails["description"],
                productDetails["image_path"],
                productDetails["category"],
                productDetails["timestamp"],
                productDetails["URL"],
                productDetails["price"],
            ),
        )
        self.conn.commit()

    def fetchAllProducts(self):
        self.cur.execute("SELECT * FROM products")
        return self.cur.fetchall()

    def clearDatabase(self):
        self.cur.execute("DELETE FROM products")
        self.conn.commit()
        self.cur.execute("DELETE FROM product_matches")
        self.conn.commit()

    def removeDuplicates(self):
        # Keep only the first stored row for each SKU.
        self.cur.execute("DELETE FROM products WHERE rowid NOT IN (SELECT MIN(rowid) FROM products GROUP BY sku)")
        self.conn.commit()
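main.py is not part of this excerpt, but a minimal sketch of how FlipkartDatabaseConnector is presumably driven (the product values below are placeholders; only the keys are taken from insertProduct above):

```
from datetime import datetime
from dbConnector import FlipkartDatabaseConnector

stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
db = FlipkartDatabaseConnector(stamp)
db.schemaMaker()  # only needed on the first run, before the tables exist

# Placeholder product; a real run would pass values scraped from a product page.
db.insertProduct({
    "sku": "MOBEXAMPLE123",
    "name": "Example Phone (4 GB, 64 GB)",
    "description": "Example description scraped from the product page",
    "image_path": "images/MOBEXAMPLE123.jpg",
    "category": "mobiles",
    "timestamp": stamp,
    "URL": "https://www.flipkart.com/example-phone/p/itmexample",
    "price": "9,999",
})

db.removeDuplicates()
print(db.fetchAllProducts())
```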

FlipkartScraper/flipkart.db

16 MB
Binary file not shown.

FlipkartScraper/genricHtmlib.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
from multiprocessing import Pool
import os
from datetime import datetime
import lxml.html as html
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import warnings
import requests
warnings.filterwarnings("ignore")


class SeleniumScraper:
    def __init__(self, timeout=10):
        self.timeout = timeout
        self.reqSession = requests.Session()
        self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.storagePath = os.path.join(
            os.path.dirname(os.path.abspath(__file__))
        )

        # Default browser-style headers. Note the 'authority' value still points at
        # www.amazon.com, and these headers are not used by fetch_request_normal()
        # below, which sets its own User-Agent.
        self.headers = {
            'authority': 'www.amazon.com',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        }

    def fetch_request_normal(self, url, params=None):
        # Fetch a page with requests and return the HTML text, or None on failure.
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
            }
            response = self.reqSession.get(url, headers=headers)

            if response.status_code == 200:
                return response.text

            if response.status_code == 301:
                # Retry once by following the redirect manually.
                response = requests.get(response.headers['Location'])
                response.raise_for_status()
                if response.status_code == 200:
                    return response.text

            if response.status_code == 503:
                # The site is throttling or blocking the request.
                return None

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
        return None

    def get_xpath_link(self, doc, xpath, website):
        # Extract links via XPath and turn relative paths into absolute URLs.
        try:
            name = doc.xpath("".join(xpath))
            for i in range(len(name)):
                if name[i].startswith("/"):
                    name[i] = website + name[i]
            return name
        except Exception as e:
            print("Error in getting links for xpath {}: {}".format(xpath, e))
            return None

    def get_selenium_driver(self):
        # Build a headless Chrome driver with images and logging disabled.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-logging")
        chrome_options.add_argument("--log-level=3")
        chrome_options.add_argument("--silent")
        chrome_options.add_argument("--blink-settings=imagesEnabled=false")
        driver = webdriver.Chrome(options=chrome_options)
        return driver

    def fetch_request_selenium(self, url, waiting_time=1):
        # Load a page in headless Chrome and return it as an lxml document.
        try:
            driver = self.get_selenium_driver()
            driver.get(url)
            time.sleep(waiting_time)
            doc = html.fromstring(driver.page_source)
            driver.close()
            return doc
        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
            return None

    def get_xpath_data(self, doc, xpath):
        try:
            return doc.xpath(xpath)
        except Exception as e:
            print("Error in getting data for xpath {}: {}".format(xpath, e))
            return None

    def slow_page_scroll(self, driver, speed):
        # Scroll down in 1000px steps so lazy-loaded content has time to render.
        current_scroll_position = driver.execute_script("return window.pageYOffset;")
        while current_scroll_position < driver.execute_script(
            "return document.body.scrollHeight;"
        ):
            driver.execute_script(
                "window.scrollTo(0, arguments[0]);", current_scroll_position
            )
            current_scroll_position += 1000
            time.sleep(speed)

    def data_storage(self, df_list, unique_id, name, storageFormat, storagePath=None):
        # Combine per-page DataFrames, drop duplicates, and write CSV or JSON.
        df_combined = pd.concat(df_list, ignore_index=True)
        df_combined.drop_duplicates(subset=unique_id, inplace=True)
        if storageFormat == "csv":
            df_combined.to_csv(
                self.storagePath + "/{}_{}.csv".format(name, self.stamp),
                index=False,
            )
        elif storageFormat == "json":
            df_combined.to_json(
                self.storagePath + "/{}_{}.json".format(name, self.stamp),
                orient="records",
            )

    def cleanData(self, array):
        # Strip whitespace, drop empty strings, and remove non-ASCII characters.
        array = [x.strip() for x in array]
        array = list(filter(None, array))
        array = [x.encode("ascii", "ignore").decode() for x in array]
        array = [x.replace("\n", "") for x in array]
        return array
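main.py is not included in this diff, but a short sketch of how SeleniumScraper is presumably used (the search URL and the XPath expression are illustrative placeholders, not the ones used by the real scraper):

```
import lxml.html as html
from genricHtmlib import SeleniumScraper

scraper = SeleniumScraper()

# Placeholder category URL -- the real URLs are expected to come from productList.py.
url = "https://www.flipkart.com/search?q=mobiles"

# Try a plain HTTP request first; fall back to headless Chrome if it fails.
page = scraper.fetch_request_normal(url)
doc = html.fromstring(page) if page else scraper.fetch_request_selenium(url)

if doc is not None:
    # Placeholder XPath -- the real expressions live in main.py, which is not shown here.
    links = scraper.get_xpath_link(doc, "//a/@href", "https://www.flipkart.com")
    if links:
        print(scraper.cleanData(links)[:10])
```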

0 commit comments

Comments
 (0)