
Commit 8aa59fa

Merge branch 'master' into update-dalle
2 parents: 126460f + 05b444e

File tree: 14 files changed (+557, -6 lines)

Lines changed: 7 additions & 6 deletions
@@ -1,6 +1,7 @@
-click==8.1.3
-Flask==2.1.2
-itsdangerous==2.1.2
-Jinja2==3.1.2
-MarkupSafe==2.1.1
-Werkzeug==2.1.2
+blinker==1.8.2
+click==8.1.7
+Flask==3.0.3
+itsdangerous==2.2.0
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+Werkzeug==3.0.3
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
# Web Scraping With Scrapy and MongoDB

[Web Scraping With Scrapy and MongoDB](https://realpython.com/web-scraping-with-scrapy-and-mongodb/) is an example project for building a robust web scraper for static sites leveraging Scrapy and MongoDB.

## Installation and Setup

1. Create a Python virtual environment:

   ```sh
   $ python -m venv ./venv
   $ source venv/bin/activate
   (venv) $
   ```

2. Install the requirements:

   ```sh
   (venv) $ pip install -r requirements.txt
   ```

You'll also need to [set up a MongoDB collection](https://realpython.com/web-scraping-with-scrapy-and-mongodb/#set-up-a-mongodb-collection-on-your-computer) as described in the tutorial.

## Run the Scraper

Navigate into the `books/` project directory.

Then you can start crawling the site:

```sh
(venv) $ scrapy crawl book
```

If set up correctly, this will populate your MongoDB collection with the book information scraped from the example site.

## About the Author

Martin Breuss - Email: [email protected]

## License

Distributed under the MIT license. See `LICENSE` for more information.
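
As a quick sanity check once `scrapy crawl book` has finished, a short `pymongo` snippet can confirm that documents actually landed in the collection. This is only a sketch; it assumes the local `mongodb://localhost:27017` instance and the `books_db` database and `books` collection names used elsewhere in this commit.

```python
import pymongo

# Connect to the local MongoDB instance from the tutorial setup.
client = pymongo.MongoClient("mongodb://localhost:27017")
collection = client["books_db"]["books"]

# How many books were scraped, and what does one document look like?
print(collection.count_documents({}))
print(collection.find_one())
```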

web-scraping-with-scrapy-and-mongodb/books/books/__init__.py

Whitespace-only changes.
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
import scrapy


class BooksItem(scrapy.Item):
    _id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
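
For context, `scrapy.Item` subclasses behave like dictionaries keyed by their declared fields, so `BooksItem` can be exercised outside a crawl. A minimal sketch with made-up values:

```python
from books.items import BooksItem  # assumes the project package is importable

item = BooksItem()
item["url"] = "catalogue/sample-book_1/index.html"  # illustrative value
item["title"] = "Sample Book"                       # illustrative value
item["price"] = "£10.00"                            # illustrative value

# Declared fields read back like dict entries; undeclared keys raise KeyError.
print(dict(item))
```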
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# useful for handling different item types with a single interface
from scrapy import signals


class BooksSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class BooksDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
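
Both classes above are the unmodified stubs that `scrapy startproject` generates, and nothing in this commit enables them. If you did want to activate them, the usual route is to register them in the project settings with an order value. The snippet below is a hypothetical addition (it assumes the file lives at `books/middlewares.py`), not part of this change:

```python
# Hypothetical settings.py additions, not included in this commit.
SPIDER_MIDDLEWARES = {
    "books.middlewares.BooksSpiderMiddleware": 543,
}

DOWNLOADER_MIDDLEWARES = {
    "books.middlewares.BooksDownloaderMiddleware": 543,
}
```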
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
import hashlib

import pymongo
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem


class MongoPipeline:
    COLLECTION_NAME = "books"

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE"),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        item_id = self.compute_item_id(item)
        if self.db[self.COLLECTION_NAME].find_one({"_id": item_id}):
            raise DropItem(f"Duplicate item found: {item}")
        else:
            item["_id"] = item_id
            self.db[self.COLLECTION_NAME].insert_one(
                ItemAdapter(item).asdict()
            )
            return item

    def compute_item_id(self, item):
        url = item["url"]
        return hashlib.sha256(url.encode("utf-8")).hexdigest()
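
The deduplication key is simply the SHA-256 hex digest of the book's URL, so two crawls of the same page always produce the same `_id`. A standalone sketch of that hashing step, using a made-up URL:

```python
import hashlib

url = "catalogue/a-light-in-the-attic_1000/index.html"  # illustrative URL
item_id = hashlib.sha256(url.encode("utf-8")).hexdigest()

# The same URL always hashes to the same _id, which is what lets
# MongoPipeline detect and drop duplicates on later crawls.
print(item_id)
```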
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
# Scrapy settings for books project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "books"

SPIDER_MODULES = ["books.spiders"]
NEWSPIDER_MODULE = "books.spiders"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "books.pipelines.MongoPipeline": 300,
}

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

MONGO_URI = "mongodb://localhost:27017"
MONGO_DATABASE = "books_db"

LOG_LEVEL = "WARNING"
LOG_FILE = "book_scraper.log"
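
To double-check that the custom `MONGO_URI` and `MONGO_DATABASE` values are actually picked up, the resolved settings can be inspected from a Python shell started in the `books/` project directory (where `scrapy.cfg` lives). A small sketch:

```python
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get("MONGO_URI"))           # mongodb://localhost:27017
print(settings.get("MONGO_DATABASE"))      # books_db
print(settings.getdict("ITEM_PIPELINES"))  # {'books.pipelines.MongoPipeline': 300}
```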
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
import scrapy

from books.items import BooksItem


class BookSpider(scrapy.Spider):
    name = "book"
    allowed_domains = ["books.toscrape.com"]
    start_urls = ["https://books.toscrape.com/"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url, callback=self.parse, errback=self.log_error
            )

    def parse(self, response):
        """
        @url https://books.toscrape.com
        @returns items 20 20
        @returns request 1 50
        @scrapes url title price
        """
        for book in response.css("article.product_pod"):
            item = BooksItem()
            item["url"] = book.css("h3 > a::attr(href)").get()
            item["title"] = book.css("h3 > a::attr(title)").get()
            item["price"] = book.css(".price_color::text").get()
            yield item

        next_page = response.css("li.next > a::attr(href)").get()
        if next_page:
            next_page_url = response.urljoin(next_page)
            self.logger.info(
                f"Navigating to next page with URL {next_page_url}."
            )
            yield scrapy.Request(
                url=next_page_url,
                callback=self.parse,
                errback=self.log_error,
            )

    def log_error(self, failure):
        self.logger.error(repr(failure))
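
The `parse()` docstring doubles as a set of Scrapy spider contracts (`@url`, `@returns`, `@scrapes`) that `scrapy check book` can evaluate against a live request. As an aside, the same spider can also be driven from plain Python via `CrawlerProcess`; the sketch below assumes the spider module is importable as `books.spiders.book`, which this view doesn't confirm.

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from books.spiders.book import BookSpider  # assumed module path

# Run BookSpider with the project settings, MongoDB pipeline included.
process = CrawlerProcess(get_project_settings())
process.crawl(BookSpider)
process.start()  # blocks until the crawl finishes
```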
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = books.settings

[deploy]
#url = http://localhost:6800/
project = books
