Commit 3f3ee05

Beginners series - part 5
1 parent 020197b commit 3f3ee05

10 files changed: +332 lines added, 0 removed


1_Beginner_Series/part_5__deployment_scheduling_monitoring/chocolatescraper/__init__.py

Whitespace-only changes.
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader


class ChocolateProductLoader(ItemLoader):

    default_output_processor = TakeFirst()

    price_in = MapCompose(lambda x: x.split("£")[-1])
    url_in = MapCompose(lambda x: 'https://www.chocolate.co.uk' + x)
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
import scrapy


class ChocolateProduct(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()
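
A minimal usage sketch (an assumed illustration, not part of this commit) of how the loader's input and output processors transform raw scraped values:

# Hypothetical example: feed raw values through ChocolateProductLoader.
from chocolatescraper.itemloaders import ChocolateProductLoader
from chocolatescraper.items import ChocolateProduct

loader = ChocolateProductLoader(item=ChocolateProduct())
loader.add_value('name', 'Dark Chocolate Bar')   # hypothetical raw value
loader.add_value('price', '£5.00')               # price_in strips the pound sign -> '5.00'
loader.add_value('url', '/products/dark-bar')    # url_in prefixes the site root
item = loader.load_item()                        # TakeFirst() keeps one value per field
# dict(item) -> {'name': 'Dark Chocolate Bar', 'price': '5.00',
#                'url': 'https://www.chocolate.co.uk/products/dark-bar'}
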
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ChocolatescraperSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ChocolatescraperDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

## Storing to DB
import mysql.connector  ## MySQL
import psycopg2  ## Postgres


class PriceToUSDPipeline:

    gbpToUsdRate = 1.3

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter.get('price'):

            # converting the price to a float
            floatPrice = float(adapter['price'])

            # converting the price from gbp to usd using our hard coded exchange rate
            adapter['price'] = floatPrice * self.gbpToUsdRate

            return item
        else:
            raise DropItem(f"Missing price in {item}")


class DuplicatesPipeline:

    def __init__(self):
        self.names_seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter['name'] in self.names_seen:
            raise DropItem(f"Duplicate item found: {item!r}")
        else:
            self.names_seen.add(adapter['name'])
            return item


class SavingToMySQLPipeline(object):

    def __init__(self):
        self.create_connection()

    def create_connection(self):
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            password='123456',
            database='chocolate_scraping'
        )
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        self.store_in_db(item)
        # we need to return the item below as Scrapy expects us to!
        return item

    def store_in_db(self, item):
        self.curr.execute(""" insert into chocolate_products values (%s,%s,%s)""", (
            item["name"],
            item["price"],
            item["url"]
        ))
        self.conn.commit()


class SavingToPostgresPipeline(object):

    def __init__(self):
        self.create_connection()

    def create_connection(self):
        self.conn = psycopg2.connect(
            host="localhost",
            database="chocolate_scraping",
            user="root",
            password="123456")
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        self.store_in_db(item)
        # we need to return the item below as scrapy expects us to!
        return item

    def store_in_db(self, item):
        self.curr.execute(""" insert into chocolate_products values (%s,%s,%s)""", (
            item["name"],
            item["price"],
            item["url"]
        ))
        self.conn.commit()
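
Both database pipelines assume a chocolate_products table already exists in the chocolate_scraping database. The schema itself is not part of this commit, but a minimal one-off setup sketch (with assumed column names and types matching the three inserted fields) could look like this:

# Hypothetical setup script (not part of the commit): create the table the
# pipelines insert into, using assumed column names/types.
import mysql.connector

conn = mysql.connector.connect(
    host='localhost',
    user='root',
    password='123456',
    database='chocolate_scraping'
)
curr = conn.cursor()
curr.execute("""
    CREATE TABLE IF NOT EXISTS chocolate_products (
        name VARCHAR(255),
        price VARCHAR(255),
        url VARCHAR(255)
    )
""")
conn.commit()
conn.close()
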
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
# Scrapy settings for chocolatescraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'chocolatescraper'

SPIDER_MODULES = ['chocolatescraper.spiders']
NEWSPIDER_MODULE = 'chocolatescraper.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'chocolatescraper (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True


# For storing in an AWS S3 bucket
AWS_ACCESS_KEY_ID = 'myaccesskeyhere'
AWS_SECRET_ACCESS_KEY = 'mysecretkeyhere'


# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'chocolatescraper.pipelines.PriceToUSDPipeline': 100,
    'chocolatescraper.pipelines.DuplicatesPipeline': 200,
    # 'chocolatescraper.pipelines.SavingToMySQLPipeline': 300,
    # 'chocolatescraper.pipelines.SavingToPostgresPipeline': 300,
}

# Add your ScrapeOps API key
SCRAPEOPS_API_KEY = 'YOUR-API-KEY-HERE'


# Add in the ScrapeOps extension
EXTENSIONS = {
    'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500,
}


DOWNLOADER_MIDDLEWARES = {

    ## Rotating User Agents
    # 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # 'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,

    ## Rotating Free Proxies
    # 'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
    # 'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,

    'scrapeops_scrapy.middleware.retry.RetryMiddleware': 550,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}
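
Note on ordering: Scrapy runs the enabled item pipelines in ascending order of the number assigned here, so PriceToUSDPipeline (100) processes each item before DuplicatesPipeline (200); the database pipelines stay disabled until one of the commented lines is uncommented.
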
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
import scrapy
from chocolatescraper.itemloaders import ChocolateProductLoader
from chocolatescraper.items import ChocolateProduct
from urllib.parse import urlencode

API_KEY = 'YOUR_API_KEY'

def get_proxy_url(url):
    payload = {'api_key': API_KEY, 'url': url}
    proxy_url = 'https://proxy.scrapeops.io/v1/?' + urlencode(payload)
    return proxy_url

class ChocolateSpider(scrapy.Spider):

    # The name of the spider
    name = 'chocolatespider'

    # These are the urls that we will start scraping
    def start_requests(self):
        start_url = 'https://www.chocolate.co.uk/collections/all'
        yield scrapy.Request(url=get_proxy_url(start_url), callback=self.parse)

    def parse(self, response):
        products = response.css('product-item')

        for product in products:
            chocolate = ChocolateProductLoader(item=ChocolateProduct(), selector=product)
            chocolate.add_css('name', "a.product-item-meta__title::text")
            chocolate.add_css('price', 'span.price', re='<span class="price">\n <span class="visually-hidden">Sale price</span>(.*)</span>')
            chocolate.add_css('url', 'div.product-item-meta a::attr(href)')
            yield chocolate.load_item()

        next_page = response.css('[rel="next"] ::attr(href)').get()

        if next_page is not None:
            next_page_url = 'https://www.chocolate.co.uk' + next_page
            yield response.follow(get_proxy_url(next_page_url), callback=self.parse)
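
For reference, a small sketch (an assumed illustration, not part of the commit) of what get_proxy_url produces for the start URL, using the placeholder API key:

from urllib.parse import urlencode

payload = {'api_key': 'YOUR_API_KEY', 'url': 'https://www.chocolate.co.uk/collections/all'}
print('https://proxy.scrapeops.io/v1/?' + urlencode(payload))
# https://proxy.scrapeops.io/v1/?api_key=YOUR_API_KEY&url=https%3A%2F%2Fwww.chocolate.co.uk%2Fcollections%2Fall

Every request the spider issues is routed through the ScrapeOps proxy endpoint this way, including the follow-up pagination requests.
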

1_Beginner_Series/part_5__deployment_scheduling_monitoring/requirements.txt

Whitespace-only changes.
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[scrapyd]
application = scrapy_heroku.app.application


[settings]
default = chocolatescraper.settings

[deploy]
url = http://chocolate-scraper-tut.herokuapp.com:80/
project = chocolatescraper
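
With this configuration the project points at a Scrapyd endpoint served through scrapy_heroku on Heroku; pushing the spider code to that endpoint is typically done with the scrapyd-deploy command from the scrapyd-client package (assuming it is installed locally), after which jobs can be scheduled against the chocolatescraper project.
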
